// Package archive provides helper functions for dealing with archive files.
package archive
import (
"archive/tar"
"context"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"runtime"
"strings"
"syscall"
"time"
"github.com/containerd/log"
"github.com/moby/patternmatcher"
"github.com/moby/sys/sequential"
"github.com/moby/sys/user"
"github.com/moby/go-archive/compression"
"github.com/moby/go-archive/tarheader"
)
// ImpliedDirectoryMode represents the mode (Unix permissions) applied to directories that are implied by files in a
// tar, but that do not have their own header entry.
//
// The permissions mask is stored in a constant instead of locally to ensure that magic numbers do not
// proliferate in the codebase. The default value 0755 has been selected based on the default umask of 0022, and
// a convention of mkdir(1) calling mkdir(2) with permissions of 0777, resulting in a final value of 0755.
//
// This value is currently implementation-defined, and not captured in any cross-runtime specification. Thus, it is
// subject to change in Moby at any time -- image authors who require consistent or known directory permissions
// should explicitly control them by ensuring that header entries exist for any applicable path.
//
// See also createImpliedDirectories, which consumes this constant.
const ImpliedDirectoryMode = 0o755
type (
	// WhiteoutFormat is the format of whiteouts unpacked
	WhiteoutFormat int

	// ChownOpts specifies a UID/GID pair used to force ownership of every
	// entry, overriding the ownership recorded in (or derived for) the archive.
	ChownOpts struct {
		UID int
		GID int
	}

	// TarOptions wraps the tar options.
	TarOptions struct {
		// IncludeFiles is a list of paths, relative to the source directory,
		// to include in the archive. When empty, the whole source is archived.
		IncludeFiles []string
		// ExcludePatterns holds patterns of paths to leave out of the archive.
		ExcludePatterns []string
		// Compression selects the compression algorithm applied when packing.
		Compression compression.Compression
		// NoLchown disables ownership changes (lchown) when unpacking.
		NoLchown bool
		// IDMap translates UIDs/GIDs between the host and the container.
		IDMap user.IdentityMapping
		// ChownOpts, when non-nil, forces this ownership on all entries.
		ChownOpts *ChownOpts
		// IncludeSourceDir includes the source directory itself when packing.
		IncludeSourceDir bool
		// WhiteoutFormat is the expected on disk format for whiteout files.
		// This format will be converted to the standard format on pack
		// and from the standard format on unpack.
		WhiteoutFormat WhiteoutFormat
		// When unpacking, specifies whether overwriting a directory with a
		// non-directory is allowed and vice versa.
		NoOverwriteDirNonDir bool
		// For each include when creating an archive, the included name will be
		// replaced with the matching name from this map.
		RebaseNames map[string]string
		// InUserNS indicates the process is running inside a user namespace,
		// which restricts operations such as device-node creation.
		InUserNS bool
		// Allow unpacking to succeed in spite of failures to set extended
		// attributes on the unpacked files due to the destination filesystem
		// not supporting them or a lack of permissions. Extended attributes
		// were probably in the archive for a reason, so set this option at
		// your own peril.
		BestEffortXattrs bool
	}
)
// Archiver implements the Archiver interface and allows the reuse of most utility functions of
// this package with a pluggable Untar function. Also, to facilitate the passing of specific id
// mappings for untar, an Archiver can be created with maps which will then be passed to Untar operations.
type Archiver struct {
	// Untar is the unpack implementation used by the helper methods
	// (TarUntar, UntarPath, CopyWithTar, CopyFileWithTar).
	Untar func(io.Reader, string, *TarOptions) error
	// IDMapping is forwarded as TarOptions.IDMap on every Untar call.
	IDMapping user.IdentityMapping
}
// NewDefaultArchiver returns a new Archiver without any IdentityMapping
func NewDefaultArchiver() *Archiver {
	archiver := &Archiver{
		Untar: Untar,
	}
	return archiver
}
// breakoutError is used to differentiate errors related to breaking out
// When testing archive breakout in the unit tests, this error is expected
// in order for the test to pass.
//
// It wraps (is an alias kind of) the underlying error, so callers can still
// inspect the original message.
type breakoutError error
const (
	// AUFSWhiteoutFormat is the default format for whiteouts
	AUFSWhiteoutFormat WhiteoutFormat = 0
	// OverlayWhiteoutFormat formats whiteout according to the overlay standard.
	OverlayWhiteoutFormat WhiteoutFormat = 1
)
// IsArchivePath checks if the (possibly compressed) file at the given path
// starts with a tar file header.
func IsArchivePath(path string) bool {
	f, err := os.Open(path)
	if err != nil {
		return false
	}
	defer f.Close()

	decompressed, err := compression.DecompressStream(f)
	if err != nil {
		return false
	}
	defer decompressed.Close()

	// Probe the stream: being able to read a first tar header is the test.
	_, err = tar.NewReader(decompressed).Next()
	return err == nil
}
// TarModifierFunc is a function that can be passed to ReplaceFileTarWrapper to
// modify the contents or header of an entry in the archive. If the file already
// exists in the archive the TarModifierFunc will be called with the Header and
// a reader which will return the files content. If the file does not exist both
// header and content will be nil.
//
// Returning a nil header (with a nil error) drops the entry from the output
// stream; the returned byte slice becomes the entry's new content.
type TarModifierFunc func(path string, header *tar.Header, content io.Reader) (*tar.Header, []byte, error)
// ReplaceFileTarWrapper converts inputTarStream to a new tar stream. Files in the
// tar stream are modified if they match any of the keys in mods.
//
// The rewrite happens in a background goroutine that feeds an io.Pipe; any
// error is propagated to the returned reader via CloseWithError. Modifiers
// whose key never appears in the input are invoked once at the end with a nil
// header and nil content, allowing them to append new entries.
func ReplaceFileTarWrapper(inputTarStream io.ReadCloser, mods map[string]TarModifierFunc) io.ReadCloser {
	pipeReader, pipeWriter := io.Pipe()

	go func() {
		tarReader := tar.NewReader(inputTarStream)
		tarWriter := tar.NewWriter(pipeWriter)
		defer inputTarStream.Close()
		defer tarWriter.Close()

		// modify runs one modifier and writes its (possibly rewritten) entry.
		// A nil returned header means "drop this entry".
		modify := func(name string, original *tar.Header, modifier TarModifierFunc, tarReader io.Reader) error {
			header, data, err := modifier(name, original, tarReader)
			switch {
			case err != nil:
				return err
			case header == nil:
				return nil
			}

			if header.Name == "" {
				header.Name = name
			}
			// Size must match the payload written below for a valid stream.
			header.Size = int64(len(data))
			if err := tarWriter.WriteHeader(header); err != nil {
				return err
			}
			if len(data) != 0 {
				if _, err := tarWriter.Write(data); err != nil {
					return err
				}
			}
			return nil
		}

		var err error
		var originalHeader *tar.Header
		for {
			originalHeader, err = tarReader.Next()
			if errors.Is(err, io.EOF) {
				break
			}
			if err != nil {
				pipeWriter.CloseWithError(err)
				return
			}

			modifier, ok := mods[originalHeader.Name]
			if !ok {
				// No modifiers for this file, copy the header and data
				if err := tarWriter.WriteHeader(originalHeader); err != nil {
					pipeWriter.CloseWithError(err)
					return
				}
				if err := copyWithBuffer(tarWriter, tarReader); err != nil {
					pipeWriter.CloseWithError(err)
					return
				}
				continue
			}

			// Each modifier is applied at most once; remove it so the
			// leftover-pass below only sees unmatched modifiers.
			delete(mods, originalHeader.Name)
			if err := modify(originalHeader.Name, originalHeader, modifier, tarReader); err != nil {
				pipeWriter.CloseWithError(err)
				return
			}
		}

		// Apply the modifiers that haven't matched any files in the archive
		for name, modifier := range mods {
			if err := modify(name, nil, modifier, nil); err != nil {
				pipeWriter.CloseWithError(err)
				return
			}
		}

		pipeWriter.Close()
	}()
	return pipeReader
}
// FileInfoHeader creates a populated Header from fi.
//
// Compared to the archive/tar package, this function fills in less information
// but is safe to call from a chrooted process. The AccessTime and ChangeTime
// fields are not set in the returned header, ModTime is truncated to one-second
// precision, and the Uname and Gname fields are only set when fi is a FileInfo
// value returned from tar.Header.FileInfo().
func FileInfoHeader(name string, fi os.FileInfo, link string) (*tar.Header, error) {
	hdr, err := tarheader.FileInfoHeaderNoLookups(fi, link)
	if err != nil {
		return nil, err
	}
	// PAX is required for sub-second times and long names; times are zeroed /
	// truncated for reproducibility of the produced archive.
	hdr.Format = tar.FormatPAX
	hdr.ModTime = hdr.ModTime.Truncate(time.Second)
	hdr.AccessTime = time.Time{}
	hdr.ChangeTime = time.Time{}
	// Normalize the mode bits in an OS-specific way (no-op on Linux).
	hdr.Mode = int64(chmodTarEntry(os.FileMode(hdr.Mode)))
	// Use forward slashes and a trailing "/" for directories.
	hdr.Name = canonicalTarName(name, fi.IsDir())
	return hdr, nil
}
// paxSchilyXattr is the PAX-record prefix under which extended attributes are
// stored in tar headers (the "SCHILY.xattr." convention).
const paxSchilyXattr = "SCHILY.xattr."

// ReadSecurityXattrToTarHeader reads security.capability xattr from filesystem
// to a tar header.
//
// The read is best-effort: lookup errors are ignored and simply produce no
// PAX record. Malformed (truncated) capability data is skipped rather than
// recorded, which also prevents an index/slice out-of-range panic that the
// previous implementation had on corrupt xattrs.
func ReadSecurityXattrToTarHeader(path string, hdr *tar.Header) error {
	const (
		// Values based on linux/include/uapi/linux/capability.h
		xattrCapsSz2    = 20
		versionOffset   = 3
		vfsCapRevision2 = 2
		vfsCapRevision3 = 3
	)
	capability, _ := lgetxattr(path, "security.capability")
	// Require at least the bytes up to and including the version byte;
	// shorter data cannot be a valid capability blob.
	if len(capability) > versionOffset {
		if capability[versionOffset] == vfsCapRevision3 && len(capability) >= xattrCapsSz2 {
			// Convert VFS_CAP_REVISION_3 to VFS_CAP_REVISION_2 as root UID makes no
			// sense outside the user namespace the archive is built in.
			capability[versionOffset] = vfsCapRevision2
			capability = capability[:xattrCapsSz2]
		}
		if hdr.PAXRecords == nil {
			hdr.PAXRecords = make(map[string]string)
		}
		hdr.PAXRecords[paxSchilyXattr+"security.capability"] = string(capability)
	}
	return nil
}
// tarWhiteoutConverter translates whiteout entries between the standard
// (AUFS-style) tar representation and an alternate on-disk format.
type tarWhiteoutConverter interface {
	// ConvertWrite may rewrite hdr while packing and can return an extra
	// header to be written after the original (or nil for none).
	ConvertWrite(*tar.Header, string, os.FileInfo) (*tar.Header, error)
	// ConvertRead is called while unpacking; it reports whether the entry
	// should still be written to disk after conversion.
	ConvertRead(*tar.Header, string) (bool, error)
}
// tarAppender bundles the state needed to add filesystem entries to a tar
// stream: the writer itself, hardlink bookkeeping, and ownership handling.
type tarAppender struct {
	TarWriter *tar.Writer

	// for hardlink mapping
	SeenFiles map[uint64]string
	// IdentityMapping remaps file ownership back to container IDs on pack.
	IdentityMapping user.IdentityMapping
	// ChownOpts, when non-nil, overrides ownership of every written header.
	ChownOpts *ChownOpts

	// For packing and unpacking whiteout files in the
	// non standard format. The whiteout files defined
	// by the AUFS standard are used as the tar whiteout
	// standard.
	WhiteoutConverter tarWhiteoutConverter
}
// newTarAppender builds a tarAppender that writes to writer, using idMapping
// for ownership translation and chownOpts as an optional ownership override.
func newTarAppender(idMapping user.IdentityMapping, writer io.Writer, chownOpts *ChownOpts) *tarAppender {
	appender := &tarAppender{
		TarWriter:       tar.NewWriter(writer),
		SeenFiles:       map[uint64]string{},
		IdentityMapping: idMapping,
		ChownOpts:       chownOpts,
	}
	return appender
}
// canonicalTarName provides a platform-independent and consistent POSIX-style
// path for files and directories to be archived regardless of the platform.
func canonicalTarName(name string, isDir bool) string {
	canonical := filepath.ToSlash(name)
	// Directories are marked by a trailing slash in the archive.
	if isDir && !strings.HasSuffix(canonical, "/") {
		canonical += "/"
	}
	return canonical
}
// addTarFile adds to the tar archive a file from `path` as `name`.
//
// It stats the file (without following symlinks), builds a header, records
// capabilities, folds hardlinks into tar link entries, remaps ownership back
// to container IDs, applies whiteout conversion, and finally writes header
// plus (for regular files) content.
func (ta *tarAppender) addTarFile(path, name string) error {
	fi, err := os.Lstat(path)
	if err != nil {
		return err
	}

	var link string
	if fi.Mode()&os.ModeSymlink != 0 {
		var err error
		link, err = os.Readlink(path)
		if err != nil {
			return err
		}
	}

	hdr, err := FileInfoHeader(name, fi, link)
	if err != nil {
		return err
	}
	if err := ReadSecurityXattrToTarHeader(path, hdr); err != nil {
		return err
	}

	// if it's not a directory and has more than 1 link,
	// it's hard linked, so set the type flag accordingly
	if !fi.IsDir() && hasHardlinks(fi) {
		inode, err := getInodeFromStat(fi.Sys())
		if err != nil {
			return err
		}
		// a link should have a name that it links too
		// and that linked name should be first in the tar archive
		if oldpath, ok := ta.SeenFiles[inode]; ok {
			hdr.Typeflag = tar.TypeLink
			hdr.Linkname = oldpath
			hdr.Size = 0 // This Must be here for the writer math to add up!
		} else {
			ta.SeenFiles[inode] = name
		}
	}

	// check whether the file is overlayfs whiteout
	// if yes, skip re-mapping container ID mappings.
	isOverlayWhiteout := fi.Mode()&os.ModeCharDevice != 0 && hdr.Devmajor == 0 && hdr.Devminor == 0

	// handle re-mapping container ID mappings back to host ID mappings before
	// writing tar headers/files. We skip whiteout files because they were written
	// by the kernel and already have proper ownership relative to the host
	if !isOverlayWhiteout && !strings.HasPrefix(filepath.Base(hdr.Name), WhiteoutPrefix) && !ta.IdentityMapping.Empty() {
		uid, gid, err := getFileUIDGID(fi.Sys())
		if err != nil {
			return err
		}
		hdr.Uid, hdr.Gid, err = ta.IdentityMapping.ToContainer(uid, gid)
		if err != nil {
			return err
		}
	}

	// explicitly override with ChownOpts
	if ta.ChownOpts != nil {
		hdr.Uid = ta.ChownOpts.UID
		hdr.Gid = ta.ChownOpts.GID
	}

	if ta.WhiteoutConverter != nil {
		wo, err := ta.WhiteoutConverter.ConvertWrite(hdr, path, fi)
		if err != nil {
			return err
		}

		// If a new whiteout file exists, write original hdr, then
		// replace hdr with wo to be written after. Whiteouts should
		// always be written after the original. Note the original
		// hdr may have been updated to be a whiteout with returning
		// a whiteout header
		if wo != nil {
			if err := ta.TarWriter.WriteHeader(hdr); err != nil {
				return err
			}
			// A whiteout must never carry content.
			if hdr.Typeflag == tar.TypeReg && hdr.Size > 0 {
				return fmt.Errorf("tar: cannot use whiteout for non-empty file")
			}
			hdr = wo
		}
	}

	if err := ta.TarWriter.WriteHeader(hdr); err != nil {
		return err
	}

	if hdr.Typeflag == tar.TypeReg && hdr.Size > 0 {
		// We use sequential file access to avoid depleting the standby list on
		// Windows. On Linux, this equates to a regular os.Open.
		file, err := sequential.Open(path)
		if err != nil {
			return err
		}

		err = copyWithBuffer(ta.TarWriter, file)
		file.Close()
		if err != nil {
			return err
		}
	}
	return nil
}
// createTarFile materializes a single tar entry hdr (content from reader) at
// path inside extractDir, then applies ownership, xattrs, mode, and times.
// breakoutError is returned when a link target would escape extractDir.
func createTarFile(path, extractDir string, hdr *tar.Header, reader io.Reader, opts *TarOptions) error {
	var (
		Lchown                     = true
		inUserns, bestEffortXattrs bool
		chownOpts                  *ChownOpts
	)

	// TODO(thaJeztah): make opts a required argument.
	if opts != nil {
		Lchown = !opts.NoLchown
		inUserns = opts.InUserNS // TODO(thaJeztah): consider deprecating opts.InUserNS and detect locally.
		chownOpts = opts.ChownOpts
		bestEffortXattrs = opts.BestEffortXattrs
	}

	// hdr.Mode is in linux format, which we can use for sycalls,
	// but for os.Foo() calls we need the mode converted to os.FileMode,
	// so use hdrInfo.Mode() (they differ for e.g. setuid bits)
	hdrInfo := hdr.FileInfo()

	switch hdr.Typeflag {
	case tar.TypeDir:
		// Create directory unless it exists as a directory already.
		// In that case we just want to merge the two
		if fi, err := os.Lstat(path); err != nil || !fi.IsDir() {
			if err := os.Mkdir(path, hdrInfo.Mode()); err != nil {
				return err
			}
		}

	case tar.TypeReg:
		// Source is regular file. We use sequential file access to avoid depleting
		// the standby list on Windows. On Linux, this equates to a regular os.OpenFile.
		file, err := sequential.OpenFile(path, os.O_CREATE|os.O_WRONLY, hdrInfo.Mode())
		if err != nil {
			return err
		}
		if err := copyWithBuffer(file, reader); err != nil {
			_ = file.Close()
			return err
		}
		_ = file.Close()

	case tar.TypeBlock, tar.TypeChar:
		if inUserns { // cannot create devices in a userns
			log.G(context.TODO()).WithFields(log.Fields{"path": path, "type": hdr.Typeflag}).Debug("skipping device nodes in a userns")
			return nil
		}
		// Handle this is an OS-specific way
		if err := handleTarTypeBlockCharFifo(hdr, path); err != nil {
			return err
		}

	case tar.TypeFifo:
		// Handle this is an OS-specific way
		if err := handleTarTypeBlockCharFifo(hdr, path); err != nil {
			if inUserns && errors.Is(err, syscall.EPERM) {
				// In most cases, cannot create a fifo if running in user namespace
				log.G(context.TODO()).WithFields(log.Fields{"error": err, "path": path, "type": hdr.Typeflag}).Debug("creating fifo node in a userns")
				return nil
			}
			return err
		}

	case tar.TypeLink:
		// #nosec G305 -- The target path is checked for path traversal.
		targetPath := filepath.Join(extractDir, hdr.Linkname)
		// check for hardlink breakout
		if !strings.HasPrefix(targetPath, extractDir) {
			return breakoutError(fmt.Errorf("invalid hardlink %q -> %q", targetPath, hdr.Linkname))
		}
		if err := os.Link(targetPath, path); err != nil {
			return err
		}

	case tar.TypeSymlink:
		// path -> hdr.Linkname = targetPath
		// e.g. /extractDir/path/to/symlink -> ../2/file = /extractDir/path/2/file
		targetPath := filepath.Join(filepath.Dir(path), hdr.Linkname) // #nosec G305 -- The target path is checked for path traversal.

		// the reason we don't need to check symlinks in the path (with FollowSymlinkInScope) is because
		// that symlink would first have to be created, which would be caught earlier, at this very check:
		if !strings.HasPrefix(targetPath, extractDir) {
			return breakoutError(fmt.Errorf("invalid symlink %q -> %q", path, hdr.Linkname))
		}
		if err := os.Symlink(hdr.Linkname, path); err != nil {
			return err
		}

	case tar.TypeXGlobalHeader:
		log.G(context.TODO()).Debug("PAX Global Extended Headers found and ignored")
		return nil

	default:
		return fmt.Errorf("unhandled tar header type %d", hdr.Typeflag)
	}

	// Lchown is not supported on Windows.
	if Lchown && runtime.GOOS != "windows" {
		if chownOpts == nil {
			chownOpts = &ChownOpts{UID: hdr.Uid, GID: hdr.Gid}
		}
		if err := os.Lchown(path, chownOpts.UID, chownOpts.GID); err != nil {
			var msg string
			if inUserns && errors.Is(err, syscall.EINVAL) {
				msg = " (try increasing the number of subordinate IDs in /etc/subuid and /etc/subgid)"
			}
			return fmt.Errorf("failed to Lchown %q for UID %d, GID %d%s: %w", path, hdr.Uid, hdr.Gid, msg, err)
		}
	}

	var xattrErrs []string
	for key, value := range hdr.PAXRecords {
		xattr, ok := strings.CutPrefix(key, paxSchilyXattr)
		if !ok {
			continue
		}
		if err := lsetxattr(path, xattr, []byte(value), 0); err != nil {
			// NOTE(review): due to && binding tighter than ||, EPERM is
			// tolerated even when BestEffortXattrs is false — presumably
			// intentional for restricted userns setups, but worth confirming.
			if bestEffortXattrs && errors.Is(err, syscall.ENOTSUP) || errors.Is(err, syscall.EPERM) {
				// EPERM occurs if modifying xattrs is not allowed. This can
				// happen when running in userns with restrictions (ChromeOS).
				xattrErrs = append(xattrErrs, err.Error())
				continue
			}
			return err
		}
	}

	if len(xattrErrs) > 0 {
		log.G(context.TODO()).WithFields(log.Fields{
			"errors": xattrErrs,
		}).Warn("ignored xattrs in archive: underlying filesystem doesn't support them")
	}

	// There is no LChmod, so ignore mode for symlink. Also, this
	// must happen after chown, as that can modify the file mode
	if err := handleLChmod(hdr, path, hdrInfo); err != nil {
		return err
	}

	aTime := boundTime(latestTime(hdr.AccessTime, hdr.ModTime))
	mTime := boundTime(hdr.ModTime)

	// chtimes doesn't support a NOFOLLOW flag atm
	if hdr.Typeflag == tar.TypeLink {
		// NOTE(review): this stats hdr.Linkname as-is (relative to the
		// process CWD, not extractDir) — verify this is the intended path.
		if fi, err := os.Lstat(hdr.Linkname); err == nil && (fi.Mode()&os.ModeSymlink == 0) {
			if err := chtimes(path, aTime, mTime); err != nil {
				return err
			}
		}
	} else if hdr.Typeflag != tar.TypeSymlink {
		if err := chtimes(path, aTime, mTime); err != nil {
			return err
		}
	} else {
		if err := lchtimes(path, aTime, mTime); err != nil {
			return err
		}
	}
	return nil
}
// Tar creates an archive from the directory at `path`, and returns it as a
// stream of bytes.
func Tar(path string, comp compression.Compression) (io.ReadCloser, error) {
	opts := &TarOptions{Compression: comp}
	return TarWithOptions(path, opts)
}
// TarWithOptions creates an archive from the directory at `path`, only including files whose relative
// paths are included in `options.IncludeFiles` (if non-nil) or not in `options.ExcludePatterns`.
func TarWithOptions(srcPath string, options *TarOptions) (io.ReadCloser, error) {
	tarballer, err := NewTarballer(srcPath, options)
	if err != nil {
		return nil, err
	}
	// Archive in the background; the caller drives progress by reading.
	go tarballer.Do()
	return tarballer.Reader(), nil
}
// Tarballer is a lower-level interface to TarWithOptions which gives the caller
// control over which goroutine the archiving operation executes on.
type Tarballer struct {
	srcPath string
	options *TarOptions
	// pm evaluates options.ExcludePatterns during the walk.
	pm         *patternmatcher.PatternMatcher
	pipeReader *io.PipeReader
	pipeWriter *io.PipeWriter
	// compressWriter wraps pipeWriter with the configured compression.
	compressWriter    io.WriteCloser
	whiteoutConverter tarWhiteoutConverter
}
// NewTarballer constructs a new tarballer. The arguments are the same as for
// TarWithOptions.
func NewTarballer(srcPath string, options *TarOptions) (*Tarballer, error) {
	matcher, err := patternmatcher.New(options.ExcludePatterns)
	if err != nil {
		return nil, err
	}

	r, w := io.Pipe()
	compressed, err := compression.CompressStream(w, options.Compression)
	if err != nil {
		return nil, err
	}

	tb := &Tarballer{
		// Fix the source path to work with long path names. This is a no-op
		// on platforms other than Windows.
		srcPath:           addLongPathPrefix(srcPath),
		options:           options,
		pm:                matcher,
		pipeReader:        r,
		pipeWriter:        w,
		compressWriter:    compressed,
		whiteoutConverter: getWhiteoutConverter(options.WhiteoutFormat),
	}
	return tb, nil
}
// Reader returns the reader for the created archive.
//
// The reader yields data produced by a concurrently-running Do.
func (t *Tarballer) Reader() io.ReadCloser {
	return t.pipeReader
}
// Do performs the archiving operation in the background. The resulting archive
// can be read from t.Reader(). Do should only be called once on each Tarballer
// instance.
func (t *Tarballer) Do() {
	ta := newTarAppender(
		t.options.IDMap,
		t.compressWriter,
		t.options.ChownOpts,
	)
	ta.WhiteoutConverter = t.whiteoutConverter

	// Closing in this order flushes the tar trailer through the compressor
	// and finally signals EOF to the pipe reader.
	defer func() {
		// Make sure to check the error on Close.
		if err := ta.TarWriter.Close(); err != nil {
			log.G(context.TODO()).Errorf("Can't close tar writer: %s", err)
		}
		if err := t.compressWriter.Close(); err != nil {
			log.G(context.TODO()).Errorf("Can't close compress writer: %s", err)
		}
		if err := t.pipeWriter.Close(); err != nil {
			log.G(context.TODO()).Errorf("Can't close pipe writer: %s", err)
		}
	}()

	// In general we log errors here but ignore them because
	// during e.g. a diff operation the container can continue
	// mutating the filesystem and we can see transient errors
	// from this

	stat, err := os.Lstat(t.srcPath)
	if err != nil {
		return
	}

	if !stat.IsDir() {
		// We can't later join a non-dir with any includes because the
		// 'walk' will error if "file/." is stat-ed and "file" is not a
		// directory. So, we must split the source path and use the
		// basename as the include.
		if len(t.options.IncludeFiles) > 0 {
			log.G(context.TODO()).Warn("Tar: Can't archive a file with includes")
		}

		dir, base := SplitPathDirEntry(t.srcPath)
		t.srcPath = dir
		t.options.IncludeFiles = []string{base}
	}

	if len(t.options.IncludeFiles) == 0 {
		t.options.IncludeFiles = []string{"."}
	}

	// seen de-duplicates entries across overlapping includes.
	seen := make(map[string]bool)

	for _, include := range t.options.IncludeFiles {
		rebaseName := t.options.RebaseNames[include]

		// Stack of ancestor directories and their match results, used by
		// MatchesUsingParentResults to avoid re-matching parent paths.
		var (
			parentMatchInfo []patternmatcher.MatchInfo
			parentDirs      []string
		)

		walkRoot := getWalkRoot(t.srcPath, include)
		// TODO(thaJeztah): should this error be handled?
		_ = filepath.WalkDir(walkRoot, func(filePath string, f os.DirEntry, err error) error {
			if err != nil {
				log.G(context.TODO()).Errorf("Tar: Can't stat file %s to tar: %s", t.srcPath, err)
				return nil
			}

			relFilePath, err := filepath.Rel(t.srcPath, filePath)
			if err != nil || (!t.options.IncludeSourceDir && relFilePath == "." && f.IsDir()) {
				// Error getting relative path OR we are looking
				// at the source directory path. Skip in both situations.
				return nil
			}

			if t.options.IncludeSourceDir && include == "." && relFilePath != "." {
				relFilePath = strings.Join([]string{".", relFilePath}, string(filepath.Separator))
			}

			skip := false

			// If "include" is an exact match for the current file
			// then even if there's an "excludePatterns" pattern that
			// matches it, don't skip it. IOW, assume an explicit 'include'
			// is asking for that file no matter what - which is true
			// for some files, like .dockerignore and Dockerfile (sometimes)
			if include != relFilePath {
				// Pop ancestors that are not parents of the current path.
				for len(parentDirs) != 0 {
					lastParentDir := parentDirs[len(parentDirs)-1]
					if strings.HasPrefix(relFilePath, lastParentDir+string(os.PathSeparator)) {
						break
					}
					parentDirs = parentDirs[:len(parentDirs)-1]
					parentMatchInfo = parentMatchInfo[:len(parentMatchInfo)-1]
				}

				var matchInfo patternmatcher.MatchInfo
				if len(parentMatchInfo) != 0 {
					skip, matchInfo, err = t.pm.MatchesUsingParentResults(relFilePath, parentMatchInfo[len(parentMatchInfo)-1])
				} else {
					skip, matchInfo, err = t.pm.MatchesUsingParentResults(relFilePath, patternmatcher.MatchInfo{})
				}
				if err != nil {
					log.G(context.TODO()).Errorf("Error matching %s: %v", relFilePath, err)
					return err
				}

				if f.IsDir() {
					parentDirs = append(parentDirs, relFilePath)
					parentMatchInfo = append(parentMatchInfo, matchInfo)
				}
			}

			if skip {
				// If we want to skip this file and its a directory
				// then we should first check to see if there's an
				// excludes pattern (e.g. !dir/file) that starts with this
				// dir. If so then we can't skip this dir.

				// Its not a dir then so we can just return/skip.
				if !f.IsDir() {
					return nil
				}

				// No exceptions (!...) in patterns so just skip dir
				if !t.pm.Exclusions() {
					return filepath.SkipDir
				}

				dirSlash := relFilePath + string(filepath.Separator)

				for _, pat := range t.pm.Patterns() {
					if !pat.Exclusion() {
						continue
					}
					if strings.HasPrefix(pat.String()+string(filepath.Separator), dirSlash) {
						// found a match - so can't skip this dir
						return nil
					}
				}

				// No matching exclusion dir so just skip dir
				return filepath.SkipDir
			}

			if seen[relFilePath] {
				return nil
			}
			seen[relFilePath] = true

			// Rename the base resource.
			if rebaseName != "" {
				var replacement string
				if rebaseName != string(filepath.Separator) {
					// Special case the root directory to replace with an
					// empty string instead so that we don't end up with
					// double slashes in the paths.
					replacement = rebaseName
				}

				relFilePath = strings.Replace(relFilePath, include, replacement, 1)
			}

			if err := ta.addTarFile(filePath, relFilePath); err != nil {
				log.G(context.TODO()).Errorf("Can't add file %s to tar: %s", filePath, err)
				// if pipe is broken, stop writing tar stream to it
				if errors.Is(err, io.ErrClosedPipe) {
					return err
				}
			}
			return nil
		})
	}
}
// Unpack unpacks the decompressedArchive to dest with options.
//
// Entries are created in order; directory mtimes are applied in a second pass
// so that creating children does not clobber them. Paths that would escape
// dest produce a breakoutError.
func Unpack(decompressedArchive io.Reader, dest string, options *TarOptions) error {
	tr := tar.NewReader(decompressedArchive)

	var dirs []*tar.Header
	whiteoutConverter := getWhiteoutConverter(options.WhiteoutFormat)

	// Iterate through the files in the archive.
loop:
	for {
		hdr, err := tr.Next()
		if errors.Is(err, io.EOF) {
			// end of tar archive
			break
		}
		if err != nil {
			return err
		}

		// ignore XGlobalHeader early to avoid creating parent directories for them
		if hdr.Typeflag == tar.TypeXGlobalHeader {
			log.G(context.TODO()).Debugf("PAX Global Extended Headers found for %s and ignored", hdr.Name)
			continue
		}

		// Normalize name, for safety and for a simple is-root check
		// This keeps "../" as-is, but normalizes "/../" to "/". Or Windows:
		// This keeps "..\" as-is, but normalizes "\..\" to "\".
		hdr.Name = filepath.Clean(hdr.Name)

		// Note: exclusion here is a simple prefix match on the entry name.
		for _, exclude := range options.ExcludePatterns {
			if strings.HasPrefix(hdr.Name, exclude) {
				continue loop
			}
		}

		// Ensure that the parent directory exists.
		err = createImpliedDirectories(dest, hdr, options)
		if err != nil {
			return err
		}

		// #nosec G305 -- The joined path is checked for path traversal.
		path := filepath.Join(dest, hdr.Name)
		rel, err := filepath.Rel(dest, path)
		if err != nil {
			return err
		}
		if strings.HasPrefix(rel, ".."+string(os.PathSeparator)) {
			return breakoutError(fmt.Errorf("%q is outside of %q", hdr.Name, dest))
		}

		// If path exits we almost always just want to remove and replace it
		// The only exception is when it is a directory *and* the file from
		// the layer is also a directory. Then we want to merge them (i.e.
		// just apply the metadata from the layer).
		if fi, err := os.Lstat(path); err == nil {
			if options.NoOverwriteDirNonDir && fi.IsDir() && hdr.Typeflag != tar.TypeDir {
				// If NoOverwriteDirNonDir is true then we cannot replace
				// an existing directory with a non-directory from the archive.
				return fmt.Errorf("cannot overwrite directory %q with non-directory %q", path, dest)
			}

			if options.NoOverwriteDirNonDir && !fi.IsDir() && hdr.Typeflag == tar.TypeDir {
				// If NoOverwriteDirNonDir is true then we cannot replace
				// an existing non-directory with a directory from the archive.
				return fmt.Errorf("cannot overwrite non-directory %q with directory %q", path, dest)
			}

			if fi.IsDir() && hdr.Name == "." {
				continue
			}

			if !fi.IsDir() || hdr.Typeflag != tar.TypeDir {
				if err := os.RemoveAll(path); err != nil {
					return err
				}
			}
		}

		// Translate archive ownership to host IDs before creating the entry.
		if err := remapIDs(options.IDMap, hdr); err != nil {
			return err
		}

		if whiteoutConverter != nil {
			writeFile, err := whiteoutConverter.ConvertRead(hdr, path)
			if err != nil {
				return err
			}
			if !writeFile {
				continue
			}
		}

		if err := createTarFile(path, dest, hdr, tr, options); err != nil {
			return err
		}

		// Directory mtimes must be handled at the end to avoid further
		// file creation in them to modify the directory mtime
		if hdr.Typeflag == tar.TypeDir {
			dirs = append(dirs, hdr)
		}
	}

	for _, hdr := range dirs {
		// #nosec G305 -- The header was checked for path traversal before it was appended to the dirs slice.
		path := filepath.Join(dest, hdr.Name)
		if err := chtimes(path, boundTime(latestTime(hdr.AccessTime, hdr.ModTime)), boundTime(hdr.ModTime)); err != nil {
			return err
		}
	}
	return nil
}
// createImpliedDirectories will create all parent directories of the current path with default permissions, if they do
// not already exist. This is possible as the tar format supports 'implicit' directories, where their existence is
// defined by the paths of files in the tar, but there are no header entries for the directories themselves, and thus
// we most both create them and choose metadata like permissions.
//
// The caller should have performed filepath.Clean(hdr.Name), so hdr.Name will now be in the filepath format for the OS
// on which the daemon is running. This precondition is required because this function assumes a OS-specific path
// separator when checking that a path is not the root.
func createImpliedDirectories(dest string, hdr *tar.Header, options *TarOptions) error {
	// Not the root directory, ensure that the parent directory exists
	if !strings.HasSuffix(hdr.Name, string(os.PathSeparator)) {
		parent := filepath.Dir(hdr.Name)
		parentPath := filepath.Join(dest, parent)
		if _, err := os.Lstat(parentPath); err != nil && os.IsNotExist(err) {
			// RootPair() is confined inside this loop as most cases will not require a call, so we can spend some
			// unneeded function calls in the uncommon case to encapsulate logic -- implied directories are a niche
			// usage that reduces the portability of an image.
			uid, gid := options.IDMap.RootPair()
			// ImpliedDirectoryMode (0755) is the implementation-defined mode
			// for directories that have no header entry of their own.
			err = user.MkdirAllAndChown(parentPath, ImpliedDirectoryMode, uid, gid, user.WithOnlyNew)
			if err != nil {
				return err
			}
		}
	}
	return nil
}
// Untar reads a stream of bytes from `archive`, parses it as a tar archive,
// and unpacks it into the directory at `dest`.
// The archive may be compressed with one of the following algorithms:
// identity (uncompressed), gzip, bzip2, xz.
//
// options may be nil, in which case defaults are used.
//
// FIXME: specify behavior when target path exists vs. doesn't exist.
func Untar(tarArchive io.Reader, dest string, options *TarOptions) error {
	return untarHandler(tarArchive, dest, options, true)
}
// UntarUncompressed reads a stream of bytes from `archive`, parses it as a tar archive,
// and unpacks it into the directory at `dest`.
// The archive must be an uncompressed stream.
//
// options may be nil, in which case defaults are used.
func UntarUncompressed(tarArchive io.Reader, dest string, options *TarOptions) error {
	return untarHandler(tarArchive, dest, options, false)
}
// untarHandler is the shared implementation behind Untar and
// UntarUncompressed, teasing out the automatic decompression.
func untarHandler(tarArchive io.Reader, dest string, options *TarOptions, decompress bool) error {
	if tarArchive == nil {
		return errors.New("empty archive")
	}

	// Fill in defaults so Unpack can assume non-nil options.
	if options == nil {
		options = &TarOptions{}
	}
	if options.ExcludePatterns == nil {
		options.ExcludePatterns = []string{}
	}

	stream := tarArchive
	if decompress {
		decompressed, err := compression.DecompressStream(tarArchive)
		if err != nil {
			return err
		}
		defer decompressed.Close()
		stream = decompressed
	}

	return Unpack(stream, filepath.Clean(dest), options)
}
// TarUntar is a convenience function which calls Tar and Untar, with the output of one piped into the other.
// If either Tar or Untar fails, TarUntar aborts and returns the error.
func (archiver *Archiver) TarUntar(src, dst string) error {
	// Pack without compression; the stream never leaves the process.
	archive, err := Tar(src, compression.None)
	if err != nil {
		return err
	}
	defer archive.Close()
	return archiver.Untar(archive, dst, &TarOptions{
		IDMap: archiver.IDMapping,
	})
}
// UntarPath untar a file from path to a destination, src is the source tar file path.
func (archiver *Archiver) UntarPath(src, dst string) error {
	archive, err := os.Open(src)
	if err != nil {
		return err
	}
	defer archive.Close()
	// Delegate to the pluggable Untar with this archiver's identity mapping.
	return archiver.Untar(archive, dst, &TarOptions{
		IDMap: archiver.IDMapping,
	})
}
// CopyWithTar creates a tar archive of filesystem path `src`, and
// unpacks it at filesystem path `dst`.
// The archive is streamed directly with fixed buffering and no
// intermediary disk IO.
func (archiver *Archiver) CopyWithTar(src, dst string) error {
	srcSt, err := os.Stat(src)
	if err != nil {
		return err
	}
	// A single file takes the dedicated single-file path.
	if !srcSt.IsDir() {
		return archiver.CopyFileWithTar(src, dst)
	}

	// if this Archiver is set up with ID mapping we need to create
	// the new destination directory with the remapped root UID/GID pair
	// as owner
	uid, gid := archiver.IDMapping.RootPair()
	// Create dst, copy src's content into it
	if err := user.MkdirAllAndChown(dst, 0o755, uid, gid, user.WithOnlyNew); err != nil {
		return err
	}
	return archiver.TarUntar(src, dst)
}
// CopyFileWithTar emulates the behavior of the 'cp' command-line
// for a single file. It copies a regular file from path `src` to
// path `dst`, and preserves all its metadata.
//
// A one-entry tar stream is produced on an in-process pipe by a background
// goroutine and consumed by archiver.Untar; the goroutine's error is
// collected via errC in the deferred block below.
func (archiver *Archiver) CopyFileWithTar(src, dst string) (err error) {
	srcSt, err := os.Stat(src)
	if err != nil {
		return err
	}

	if srcSt.IsDir() {
		return errors.New("can't copy a directory")
	}

	// Clean up the trailing slash. This must be done in an operating
	// system specific manner.
	if dst[len(dst)-1] == os.PathSeparator {
		dst = filepath.Join(dst, filepath.Base(src))
	}
	// Create the holding directory if necessary
	if err := os.MkdirAll(filepath.Dir(dst), 0o700); err != nil {
		return err
	}

	r, w := io.Pipe()
	errC := make(chan error, 1)

	go func() {
		defer close(errC)

		errC <- func() error {
			defer w.Close()

			srcF, err := os.Open(src)
			if err != nil {
				return err
			}
			defer srcF.Close()

			hdr, err := tarheader.FileInfoHeaderNoLookups(srcSt, "")
			if err != nil {
				return err
			}
			// Normalize the header the same way FileInfoHeader does:
			// PAX format, second-precision mtime, zeroed atime/ctime.
			hdr.Format = tar.FormatPAX
			hdr.ModTime = hdr.ModTime.Truncate(time.Second)
			hdr.AccessTime = time.Time{}
			hdr.ChangeTime = time.Time{}
			// The single entry is named after the destination file.
			hdr.Name = filepath.Base(dst)
			hdr.Mode = int64(chmodTarEntry(os.FileMode(hdr.Mode)))

			if err := remapIDs(archiver.IDMapping, hdr); err != nil {
				return err
			}

			tw := tar.NewWriter(w)
			defer tw.Close()
			if err := tw.WriteHeader(hdr); err != nil {
				return err
			}
			if err := copyWithBuffer(tw, srcF); err != nil {
				return err
			}
			return nil
		}()
	}()
	defer func() {
		// Prefer the producer goroutine's error if Untar itself succeeded.
		if er := <-errC; err == nil && er != nil {
			err = er
		}
	}()

	err = archiver.Untar(r, filepath.Dir(dst), nil)
	if err != nil {
		// Unblock the producer so the goroutine can exit.
		r.CloseWithError(err)
	}
	return err
}
// IdentityMapping returns the IdentityMapping of the archiver, i.e. the
// UID/GID remapping applied when packing and unpacking archives.
func (archiver *Archiver) IdentityMapping() user.IdentityMapping {
	return archiver.IDMapping
}
// remapIDs rewrites hdr's Uid/Gid through the given identity mapping
// (container IDs to host IDs). Note that the fields are assigned even
// when the mapping fails; callers must check the returned error.
func remapIDs(idMapping user.IdentityMapping, hdr *tar.Header) error {
	var err error
	hdr.Uid, hdr.Gid, err = idMapping.ToHost(hdr.Uid, hdr.Gid)
	return err
}
package archive
import (
"archive/tar"
"fmt"
"os"
"path/filepath"
"strings"
"github.com/moby/sys/userns"
"golang.org/x/sys/unix"
)
// getWhiteoutConverter returns the converter for the requested whiteout
// format, or nil when no conversion is needed (the standard AUFS format).
func getWhiteoutConverter(format WhiteoutFormat) tarWhiteoutConverter {
	switch format {
	case OverlayWhiteoutFormat:
		return overlayWhiteoutConverter{}
	default:
		return nil
	}
}
// overlayWhiteoutConverter translates between the overlayfs on-disk
// whiteout representation (character devices and "overlay.opaque"
// xattrs) and the standard AUFS in-archive representation (".wh."
// marker files).
type overlayWhiteoutConverter struct{}

// ConvertWrite rewrites hdr while packing an overlayfs layer so that the
// archive uses AUFS-style whiteouts. Overlay whiteouts (0/0 character
// devices) are renamed into ".wh."-prefixed empty regular files. For a
// directory marked opaque via the overlay xattr, an extra header for the
// opaque-dir marker file is returned; otherwise the extra header is nil.
func (overlayWhiteoutConverter) ConvertWrite(hdr *tar.Header, path string, fi os.FileInfo) (wo *tar.Header, _ error) {
	// convert whiteouts to AUFS format
	if fi.Mode()&os.ModeCharDevice != 0 && hdr.Devmajor == 0 && hdr.Devminor == 0 {
		// we just rename the file and make it normal
		dir, filename := filepath.Split(hdr.Name)
		hdr.Name = filepath.Join(dir, WhiteoutPrefix+filename)
		hdr.Mode = 0o600
		hdr.Typeflag = tar.TypeReg
		hdr.Size = 0
	}

	// Only directories can carry the opaque attribute.
	if fi.Mode()&os.ModeDir == 0 {
		// FIXME(thaJeztah): return a sentinel error instead of nil, nil
		return nil, nil
	}

	// In a user namespace the unprivileged "user." xattr namespace is
	// used instead of "trusted.".
	opaqueXattrName := "trusted.overlay.opaque"
	if userns.RunningInUserNS() {
		opaqueXattrName = "user.overlay.opaque"
	}

	// convert opaque dirs to AUFS format by writing an empty file with the prefix
	opaque, err := lgetxattr(path, opaqueXattrName)
	if err != nil {
		return nil, err
	}
	if len(opaque) != 1 || opaque[0] != 'y' {
		// FIXME(thaJeztah): return a sentinel error instead of nil, nil
		return nil, nil
	}

	// The opaque flag is represented by the marker file instead, so drop
	// the xattr from the directory's own header.
	delete(hdr.PAXRecords, paxSchilyXattr+opaqueXattrName)

	// create a header for the whiteout file
	// it should inherit some properties from the parent, but be a regular file
	return &tar.Header{
		Typeflag:   tar.TypeReg,
		Mode:       hdr.Mode & int64(os.ModePerm),
		Name:       filepath.Join(hdr.Name, WhiteoutOpaqueDir), // #nosec G305 -- An archive is being created, not extracted.
		Size:       0,
		Uid:        hdr.Uid,
		Uname:      hdr.Uname,
		Gid:        hdr.Gid,
		Gname:      hdr.Gname,
		AccessTime: hdr.AccessTime,
		ChangeTime: hdr.ChangeTime,
	}, nil
}
// ConvertRead translates AUFS-style markers encountered while unpacking
// onto an overlayfs filesystem. It reports whether the entry itself
// should still be written to disk.
func (c overlayWhiteoutConverter) ConvertRead(hdr *tar.Header, path string) (bool, error) {
	base := filepath.Base(path)
	dir := filepath.Dir(path)

	// An AUFS opaque-dir marker becomes an overlay "opaque" xattr on the
	// parent directory; the marker file itself is never written.
	if base == WhiteoutOpaqueDir {
		attr := "trusted.overlay.opaque"
		if userns.RunningInUserNS() {
			// Unprivileged namespaces use the "user." xattr namespace.
			attr = "user.overlay.opaque"
		}
		if err := unix.Setxattr(dir, attr, []byte{'y'}, 0); err != nil {
			return false, fmt.Errorf("setxattr('%s', %s=y): %w", dir, attr, err)
		}
		// don't write the file itself
		return false, nil
	}

	// An AUFS whiteout file becomes an overlay whiteout: a 0:0 character
	// device at the original path, owned per the header; the marker is
	// not written.
	if strings.HasPrefix(base, WhiteoutPrefix) {
		target := filepath.Join(dir, strings.TrimPrefix(base, WhiteoutPrefix))
		if err := unix.Mknod(target, unix.S_IFCHR, 0); err != nil {
			return false, fmt.Errorf("failed to mknod('%s', S_IFCHR, 0): %w", target, err)
		}
		if err := os.Chown(target, hdr.Uid, hdr.Gid); err != nil {
			return false, err
		}
		// don't write the file itself
		return false, nil
	}

	return true, nil
}
//go:build !windows
package archive
import (
"archive/tar"
"errors"
"os"
"path/filepath"
"strings"
"syscall"
"golang.org/x/sys/unix"
)
// addLongPathPrefix adds the Windows long path prefix to the path provided if
// it does not already have it. It is a no-op on platforms other than Windows;
// this (non-Windows) implementation returns srcPath unchanged.
func addLongPathPrefix(srcPath string) string {
	return srcPath
}
// getWalkRoot calculates the root path when performing a TarWithOptions
// walk. It is platform specific: on Linux, filepath.Join(srcPath, include)
// cannot be used because Join cleans away a trailing "." or "/" that may
// be significant to the include pattern.
func getWalkRoot(srcPath string, include string) string {
	sep := string(filepath.Separator)
	return strings.TrimSuffix(srcPath, sep) + sep + include
}
// chmodTarEntry is used to adjust the file permissions used in tar header based
// on the platform the archival is done. On unix this is the identity
// function, as the Go APIs already provide the permission bits correctly;
// the Windows implementation differs.
func chmodTarEntry(perm os.FileMode) os.FileMode {
	return perm // noop for unix as golang APIs provide perm bits correctly
}
func getInodeFromStat(stat interface{}) (uint64, error) {
s, ok := stat.(*syscall.Stat_t)
if !ok {
// FIXME(thaJeztah): this should likely return an error; see https://github.com/moby/moby/pull/49493#discussion_r1979152897
return 0, nil
}
return s.Ino, nil
}
func getFileUIDGID(stat interface{}) (int, int, error) {
s, ok := stat.(*syscall.Stat_t)
if !ok {
return 0, 0, errors.New("cannot convert stat value to syscall.Stat_t")
}
return int(s.Uid), int(s.Gid), nil
}
// handleTarTypeBlockCharFifo is an OS-specific helper function used by
// createTarFile to handle the following types of header: Block; Char;
// Fifo. It creates the corresponding device node (or FIFO) at path.
//
// Creating device nodes is not supported when running in a user
// namespace, and produces a [syscall.EPERM] in most cases.
func handleTarTypeBlockCharFifo(hdr *tar.Header, path string) error {
	perm := uint32(hdr.Mode & 0o7777)

	var typ uint32
	switch hdr.Typeflag {
	case tar.TypeBlock:
		typ = unix.S_IFBLK
	case tar.TypeChar:
		typ = unix.S_IFCHR
	case tar.TypeFifo:
		typ = unix.S_IFIFO
	}

	dev := unix.Mkdev(uint32(hdr.Devmajor), uint32(hdr.Devminor))
	return mknod(path, perm|typ, dev)
}
func handleLChmod(hdr *tar.Header, path string, hdrInfo os.FileInfo) error {
if hdr.Typeflag == tar.TypeLink {
if fi, err := os.Lstat(hdr.Linkname); err == nil && (fi.Mode()&os.ModeSymlink == 0) {
if err := os.Chmod(path, hdrInfo.Mode()); err != nil {
return err
}
}
} else if hdr.Typeflag != tar.TypeSymlink {
if err := os.Chmod(path, hdrInfo.Mode()); err != nil {
return err
}
}
return nil
}
package archive
import (
"archive/tar"
"bytes"
"context"
"fmt"
"io"
"io/fs"
"os"
"path/filepath"
"sort"
"strings"
"time"
"github.com/containerd/log"
"github.com/moby/sys/user"
)
// ChangeType represents the kind of modification a Change records.
type ChangeType int

const (
	ChangeModify = iota // ChangeModify represents the modify operation.
	ChangeAdd           // ChangeAdd represents the add operation.
	ChangeDelete        // ChangeDelete represents the delete operation.
)

// String returns the single-letter code for the change type: "C"
// (modify), "A" (add) or "D" (delete). Unknown values yield "".
func (c ChangeType) String() string {
	switch c {
	case ChangeModify:
		return "C"
	case ChangeAdd:
		return "A"
	case ChangeDelete:
		return "D"
	default:
		return ""
	}
}
// Change represents a change, it wraps the change type and path.
// It describes changes of the files in the path respect to the
// parent layers. The change could be modify, add, delete.
// This is used for layer diff.
type Change struct {
	Path string
	Kind ChangeType
}

// String renders the change as "<kind-letter> <path>".
func (change *Change) String() string {
	return change.Kind.String() + " " + change.Path
}
// changesByPath implements sort.Interface to order Changes
// lexicographically by path (for sort.Sort).
type changesByPath []Change

func (c changesByPath) Less(i, j int) bool { return c[i].Path < c[j].Path }
func (c changesByPath) Len() int           { return len(c) }
func (c changesByPath) Swap(i, j int)      { c[j], c[i] = c[i], c[j] }
// Gnu tar doesn't have sub-second mtime precision. The go tar
// writer (1.10+) does when using PAX format, but we round times to seconds
// to ensure archives have the same hashes for backwards compatibility.
// See https://github.com/moby/moby/pull/35739/commits/fb170206ba12752214630b269a40ac7be6115ed4.
//
// Non-sub-second is problematic when we apply changes via tar
// files. We handle this by comparing for exact times, *or* same
// second count and either a or b having exactly 0 nanoseconds
func sameFsTime(a, b time.Time) bool {
return a.Equal(b) ||
(a.Unix() == b.Unix() &&
(a.Nanosecond() == 0 || b.Nanosecond() == 0))
}
// Changes walks the path rw and determines changes for the files in the path,
// with respect to the parent layers. Deletions are recognized via AUFS
// whiteout markers, and AUFS metadata entries are skipped.
func Changes(layers []string, rw string) ([]Change, error) {
	return collectChanges(layers, rw, aufsDeletedFile, aufsMetadataSkip)
}
// aufsMetadataSkip reports whether path is a top-level AUFS metadata
// entry (name starting with the whiteout meta prefix) that should be
// skipped while collecting changes. A malformed pattern yields skip=true
// together with the error.
func aufsMetadataSkip(path string) (bool, error) {
	pattern := string(os.PathSeparator) + WhiteoutMetaPrefix + "*"
	matched, err := filepath.Match(pattern, path)
	if err != nil {
		return true, err
	}
	return matched, nil
}
// aufsDeletedFile returns the original path of a file removed through an
// AUFS whiteout marker, or "" when path is not a whiteout. The root and
// fi arguments are unused but required by the deleteChange signature.
func aufsDeletedFile(root, path string, fi os.FileInfo) (string, error) {
	base := filepath.Base(path)
	if !strings.HasPrefix(base, WhiteoutPrefix) {
		return "", nil
	}
	// If there is a whiteout, then the file was removed.
	original := strings.TrimPrefix(base, WhiteoutPrefix)
	return filepath.Join(filepath.Dir(path), original), nil
}
type (
	// skipChange decides whether a rebased path in the rw layer should
	// be ignored while collecting changes.
	skipChange func(string) (bool, error)
	// deleteChange maps (root, path, fileinfo) to the original path of a
	// deleted file, or "" when the entry is not a deletion marker.
	deleteChange func(string, string, os.FileInfo) (string, error)
)
// collectChanges walks the rw layer and records one Change per entry,
// classifying each as add, modify or delete relative to the read-only
// layers. dc translates layer-specific deletion markers into the deleted
// path; sc, when non-nil, filters out layer metadata entries.
func collectChanges(layers []string, rw string, dc deleteChange, sc skipChange) ([]Change, error) {
	var (
		changes     []Change
		changedDirs = make(map[string]struct{})
	)

	err := filepath.Walk(rw, func(path string, f os.FileInfo, err error) error {
		if err != nil {
			return err
		}

		// Rebase path so it is relative to the rw layer root.
		path, err = filepath.Rel(rw, path)
		if err != nil {
			return err
		}

		// As this runs on the daemon side, file paths are OS specific.
		path = filepath.Join(string(os.PathSeparator), path)

		// Skip root
		if path == string(os.PathSeparator) {
			return nil
		}

		if sc != nil {
			if skip, err := sc(path); skip {
				return err
			}
		}

		change := Change{
			Path: path,
		}

		deletedFile, err := dc(rw, path, f)
		if err != nil {
			return err
		}

		// Find out what kind of modification happened
		if deletedFile != "" {
			change.Path = deletedFile
			change.Kind = ChangeDelete
		} else {
			// Otherwise, the file was added
			change.Kind = ChangeAdd

			// ...Unless it already existed in a top layer, in which case, it's a modification
			for _, layer := range layers {
				stat, err := os.Stat(filepath.Join(layer, path))
				if err != nil && !os.IsNotExist(err) {
					return err
				}
				if err == nil {
					// The file existed in the top layer, so that's a modification
					// However, if it's a directory, maybe it wasn't actually modified.
					// If you modify /foo/bar/baz, then /foo will be part of the changed files only because it's the parent of bar
					if stat.IsDir() && f.IsDir() {
						if f.Size() == stat.Size() && f.Mode() == stat.Mode() && sameFsTime(f.ModTime(), stat.ModTime()) {
							// Both directories are the same, don't record the change
							return nil
						}
					}
					change.Kind = ChangeModify
					break
				}
			}
		}

		// If /foo/bar/file.txt is modified, then /foo/bar must be part of the changed files.
		// This block is here to ensure the change is recorded even if the
		// modify time, mode and size of the parent directory in the rw and ro layers are all equal.
		// Check https://github.com/docker/docker/pull/13590 for details.
		if f.IsDir() {
			changedDirs[path] = struct{}{}
		}
		if change.Kind == ChangeAdd || change.Kind == ChangeDelete {
			parent := filepath.Dir(path)
			if _, ok := changedDirs[parent]; !ok && parent != "/" {
				changes = append(changes, Change{Path: parent, Kind: ChangeModify})
				changedDirs[parent] = struct{}{}
			}
		}

		// Record change
		changes = append(changes, change)
		return nil
	})
	// A missing rw directory simply yields an empty change set.
	if err != nil && !os.IsNotExist(err) {
		return nil, err
	}
	return changes, nil
}
// FileInfo describes the information of a file, as one node of an
// in-memory directory tree built while collecting changes.
type FileInfo struct {
	parent     *FileInfo            // enclosing directory; nil for the root node
	name       string               // base name of this entry
	stat       fs.FileInfo          // lstat result captured during the walk
	children   map[string]*FileInfo // entries of this directory, keyed by base name
	capability []byte               // raw "security.capability" xattr value, if any
	added      bool                 // set once this node has been recorded in a change list
}
// LookUp resolves path (an OS-separator-delimited path relative to info)
// to the corresponding tree node, or nil when any element is missing.
func (info *FileInfo) LookUp(path string) *FileInfo {
	// As this runs on the daemon side, file paths are OS specific.
	if path == string(os.PathSeparator) {
		return info
	}

	cur := info
	for _, elem := range strings.Split(path, string(os.PathSeparator)) {
		if elem == "" {
			continue
		}
		next := cur.children[elem]
		if next == nil {
			return nil
		}
		cur = next
	}
	return cur
}
// path reconstructs the absolute (OS-specific) path of this node by
// walking up to the root.
func (info *FileInfo) path() string {
	if info.parent == nil {
		// As this runs on the daemon side, file paths are OS specific.
		// The root node is the path separator itself.
		return string(os.PathSeparator)
	}
	return filepath.Join(info.parent.path(), info.name)
}
// addChanges recursively compares info's subtree against oldInfo's and
// appends the resulting Add/Modify/Delete changes to *changes. A nil
// oldInfo means everything under info is treated as an addition.
func (info *FileInfo) addChanges(oldInfo *FileInfo, changes *[]Change) {
	// Remember where this directory's entries start, so a Modify entry
	// for the directory itself can be inserted before them later.
	sizeAtEntry := len(*changes)

	if oldInfo == nil {
		// add
		change := Change{
			Path: info.path(),
			Kind: ChangeAdd,
		}
		*changes = append(*changes, change)
		info.added = true
	}

	// We make a copy so we can modify it to detect additions
	// also, we only recurse on the old dir if the new info is a directory
	// otherwise any previous delete/change is considered recursive
	oldChildren := make(map[string]*FileInfo)
	if oldInfo != nil && info.isDir() {
		for k, v := range oldInfo.children {
			oldChildren[k] = v
		}
	}

	for name, newChild := range info.children {
		oldChild := oldChildren[name]
		if oldChild != nil {
			// change?
			oldStat := oldChild.stat
			newStat := newChild.stat
			// Note: We can't compare inode or ctime or blocksize here, because these change
			// when copying a file into a container. However, that is not generally a problem
			// because any content change will change mtime, and any status change should
			// be visible when actually comparing the stat fields. The only time this
			// breaks down is if some code intentionally hides a change by setting
			// back mtime
			if statDifferent(oldStat, newStat) ||
				!bytes.Equal(oldChild.capability, newChild.capability) {
				change := Change{
					Path: newChild.path(),
					Kind: ChangeModify,
				}
				*changes = append(*changes, change)
				newChild.added = true
			}

			// Remove from copy so we can detect deletions
			delete(oldChildren, name)
		}

		newChild.addChanges(oldChild, changes)
	}
	// Anything left in the copy existed only in the old tree: deleted.
	for _, oldChild := range oldChildren {
		// delete
		change := Change{
			Path: oldChild.path(),
			Kind: ChangeDelete,
		}
		*changes = append(*changes, change)
	}

	// If there were changes inside this directory, we need to add it, even if the directory
	// itself wasn't changed. This is needed to properly save and restore filesystem permissions.
	// As this runs on the daemon side, file paths are OS specific.
	if len(*changes) > sizeAtEntry && info.isDir() && !info.added && info.path() != string(os.PathSeparator) {
		change := Change{
			Path: info.path(),
			Kind: ChangeModify,
		}
		// Let's insert the directory entry before the recently added entries located inside this dir
		*changes = append(*changes, change) // just to resize the slice, will be overwritten
		copy((*changes)[sizeAtEntry+1:], (*changes)[sizeAtEntry:])
		(*changes)[sizeAtEntry] = change
	}
}
// Changes computes the differences between oldInfo's tree and this tree
// and returns them as a flat list of Change entries.
func (info *FileInfo) Changes(oldInfo *FileInfo) []Change {
	var changes []Change

	info.addChanges(oldInfo, &changes)

	return changes
}
// newRootFileInfo returns a FileInfo node representing the filesystem
// root (the OS path separator), with no parent and no children yet.
func newRootFileInfo() *FileInfo {
	// As this runs on the daemon side, file paths are OS specific.
	return &FileInfo{
		name:     string(os.PathSeparator),
		children: make(map[string]*FileInfo),
	}
}
// ChangesDirs compares two directories and generates an array of Change
// objects describing the changes. If oldDir is "", a temporary empty
// directory stands in for it, so all files in newDir are reported as
// Add-Changes.
func ChangesDirs(newDir, oldDir string) ([]Change, error) {
	if oldDir == "" {
		emptyDir, err := os.MkdirTemp("", "empty")
		if err != nil {
			return nil, err
		}
		defer os.Remove(emptyDir)
		oldDir = emptyDir
	}

	oldRoot, newRoot, err := collectFileInfoForChanges(oldDir, newDir)
	if err != nil {
		return nil, err
	}

	return newRoot.Changes(oldRoot), nil
}
// ChangesSize calculates the size in bytes of the provided changes,
// based on newDir. Only added and modified files are counted; files
// sharing an inode (hard links) are counted once. Entries that cannot
// be stat'ed are logged and skipped.
func ChangesSize(newDir string, changes []Change) int64 {
	var total int64
	seenInodes := make(map[uint64]struct{})

	for _, change := range changes {
		if change.Kind != ChangeModify && change.Kind != ChangeAdd {
			continue
		}
		file := filepath.Join(newDir, change.Path)
		fileInfo, err := os.Lstat(file)
		if err != nil {
			log.G(context.TODO()).Errorf("Can not stat %q: %s", file, err)
			continue
		}
		if fileInfo == nil || fileInfo.IsDir() {
			continue
		}
		if hasHardlinks(fileInfo) {
			// Count each hard-linked inode only once.
			inode := getIno(fileInfo)
			if _, counted := seenInodes[inode]; counted {
				continue
			}
			seenInodes[inode] = struct{}{}
		}
		total += fileInfo.Size()
	}
	return total
}
// ExportChanges produces an Archive from the provided changes, relative to dir.
// Deletions become whiteout marker entries; adds and modifies are read
// from dir. The archive is written asynchronously into the returned pipe.
func ExportChanges(dir string, changes []Change, idMap user.IdentityMapping) (io.ReadCloser, error) {
	reader, writer := io.Pipe()
	go func() {
		ta := newTarAppender(idMap, writer, nil)

		// Sort by path so parent directories precede their contents.
		sort.Sort(changesByPath(changes))

		// In general we log errors here but ignore them because
		// during e.g. a diff operation the container can continue
		// mutating the filesystem and we can see transient errors
		// from this
		for _, change := range changes {
			if change.Kind == ChangeDelete {
				// Represent the deletion as an empty ".wh."-prefixed
				// whiteout file alongside the deleted entry's name.
				whiteOutDir := filepath.Dir(change.Path)
				whiteOutBase := filepath.Base(change.Path)
				whiteOut := filepath.Join(whiteOutDir, WhiteoutPrefix+whiteOutBase)
				timestamp := time.Now()
				hdr := &tar.Header{
					Name:       whiteOut[1:], // strip the leading separator
					Size:       0,
					ModTime:    timestamp,
					AccessTime: timestamp,
					ChangeTime: timestamp,
				}
				if err := ta.TarWriter.WriteHeader(hdr); err != nil {
					log.G(context.TODO()).Debugf("Can't write whiteout header: %s", err)
				}
			} else {
				path := filepath.Join(dir, change.Path)
				if err := ta.addTarFile(path, change.Path[1:]); err != nil {
					log.G(context.TODO()).Debugf("Can't add file %s to tar: %s", path, err)
				}
			}
		}

		// Make sure to check the error on Close.
		if err := ta.TarWriter.Close(); err != nil {
			log.G(context.TODO()).Debugf("Can't close layer: %s", err)
		}
		if err := writer.Close(); err != nil {
			log.G(context.TODO()).Debugf("failed close Changes writer: %s", err)
		}
	}()
	return reader, nil
}
package archive
import (
	"bytes"
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"syscall"
	"unsafe"

	"golang.org/x/sys/unix"
)
// walker is used to implement collectFileInfoForChanges on linux. Where this
// method in general returns the entire contents of two directory trees, we
// optimize some FS calls out on linux. In particular, we take advantage of the
// fact that getdents(2) returns the inode of each file in the directory being
// walked, which, when walking two trees in parallel to generate a list of
// changes, can be used to prune subtrees without ever having to lstat(2) them
// directly. Eliminating stat calls in this way can save up to seconds on large
// images.
type walker struct {
	dir1  string    // on-disk root of the first (old) tree
	dir2  string    // on-disk root of the second (new) tree
	root1 *FileInfo // in-memory tree being built for dir1
	root2 *FileInfo // in-memory tree being built for dir2
}
// collectFileInfoForChanges returns a complete representation of the
// trees rooted at dir1 and dir2, with one important exception: any
// subtree or leaf where the inode and device numbers are an exact match
// between dir1 and dir2 is pruned from the results. It is therefore
// *only* suitable for generating a list of changes between the two
// directories; it does not reflect their full contents.
func collectFileInfoForChanges(dir1, dir2 string) (*FileInfo, *FileInfo, error) {
	fi1, err := os.Lstat(dir1)
	if err != nil {
		return nil, nil, err
	}
	fi2, err := os.Lstat(dir2)
	if err != nil {
		return nil, nil, err
	}

	w := &walker{
		dir1:  dir1,
		dir2:  dir2,
		root1: newRootFileInfo(),
		root2: newRootFileInfo(),
	}
	if err := w.walk("/", fi1, fi2); err != nil {
		return nil, nil, err
	}
	return w.root1, w.root2, nil
}
// walkchunk registers the file fi, found at path (relative to dir), with
// the in-memory tree rooted at root. A nil fi — the file is absent on
// this side — is a no-op. It fails only when the parent directory has
// not been registered yet.
func walkchunk(path string, fi os.FileInfo, dir string, root *FileInfo) error {
	if fi == nil {
		return nil
	}
	parent := root.LookUp(filepath.Dir(path))
	if parent == nil {
		return fmt.Errorf("walkchunk: Unexpectedly no parent for %s", path)
	}
	node := &FileInfo{
		name:     filepath.Base(path),
		parent:   parent,
		children: make(map[string]*FileInfo),
		stat:     fi,
	}
	// Best-effort capability read; absence of the xattr is not an error.
	node.capability, _ = lgetxattr(filepath.Join(dir, path), "security.capability") // lgetxattr(2): fs access
	parent.children[node.name] = node
	return nil
}
// Walk a subtree rooted at the same path in both trees being iterated. For
// example, /docker/overlay/1234/a/b/c/d and /docker/overlay/8888/a/b/c/d
// i1 and i2 are the lstat results for path under dir1 and dir2; a nil
// value means the entry is absent on that side.
func (w *walker) walk(path string, i1, i2 os.FileInfo) (err error) {
	// Register these nodes with the return trees, unless we're still at the
	// (already-created) roots:
	if path != "/" {
		if err := walkchunk(path, i1, w.dir1, w.root1); err != nil {
			return err
		}
		if err := walkchunk(path, i2, w.dir2, w.root2); err != nil {
			return err
		}
	}

	is1Dir := i1 != nil && i1.IsDir()
	is2Dir := i2 != nil && i2.IsDir()

	// Inode-equality pruning below is only meaningful when both entries
	// live on the same device; otherwise matching inode numbers are
	// coincidental.
	sameDevice := false
	if i1 != nil && i2 != nil {
		si1 := i1.Sys().(*syscall.Stat_t)
		si2 := i2.Sys().(*syscall.Stat_t)
		if si1.Dev == si2.Dev {
			sameDevice = true
		}
	}

	// If these files are both non-existent, or leaves (non-dirs), we are done.
	if !is1Dir && !is2Dir {
		return nil
	}

	// Fetch the names of all the files contained in both directories being walked:
	var names1, names2 []nameIno
	if is1Dir {
		names1, err = readdirnames(filepath.Join(w.dir1, path)) // getdents(2): fs access
		if err != nil {
			return err
		}
	}
	if is2Dir {
		names2, err = readdirnames(filepath.Join(w.dir2, path)) // getdents(2): fs access
		if err != nil {
			return err
		}
	}

	// We have lists of the files contained in both parallel directories, sorted
	// in the same order. Walk them in parallel, generating a unique merged list
	// of all items present in either or both directories.
	var names []string
	ix1 := 0
	ix2 := 0

	for ix1 < len(names1) && ix2 < len(names2) {
		ni1 := names1[ix1]
		ni2 := names2[ix2]

		switch strings.Compare(ni1.name, ni2.name) {
		case -1: // ni1 < ni2 -- advance ni1
			// we will not encounter ni1 in names2
			names = append(names, ni1.name)
			ix1++
		case 0: // ni1 == ni2
			// Present on both sides: prune the entry entirely when the
			// inodes match on the same device (identical content).
			if ni1.ino != ni2.ino || !sameDevice {
				names = append(names, ni1.name)
			}
			ix1++
			ix2++
		case 1: // ni1 > ni2 -- advance ni2
			// we will not encounter ni2 in names1
			names = append(names, ni2.name)
			ix2++
		}
	}
	// Drain whichever list still has entries.
	for ix1 < len(names1) {
		names = append(names, names1[ix1].name)
		ix1++
	}
	for ix2 < len(names2) {
		names = append(names, names2[ix2].name)
		ix2++
	}

	// For each of the names present in either or both of the directories being
	// iterated, stat the name under each root, and recurse the pair of them:
	for _, name := range names {
		fname := filepath.Join(path, name)
		var cInfo1, cInfo2 os.FileInfo
		if is1Dir {
			cInfo1, err = os.Lstat(filepath.Join(w.dir1, fname)) // lstat(2): fs access
			if err != nil && !os.IsNotExist(err) {
				return err
			}
		}
		if is2Dir {
			cInfo2, err = os.Lstat(filepath.Join(w.dir2, fname)) // lstat(2): fs access
			if err != nil && !os.IsNotExist(err) {
				return err
			}
		}
		if err = w.walk(fname, cInfo1, cInfo2); err != nil {
			return err
		}
	}
	return nil
}
// nameIno is a {name, inode} pair used to support the early-pruning
// logic of the walker type.
type nameIno struct {
	name string
	ino  uint64
}

// nameInoSlice implements sort.Interface, ordering entries by name.
type nameInoSlice []nameIno

func (s nameInoSlice) Len() int           { return len(s) }
func (s nameInoSlice) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
func (s nameInoSlice) Less(i, j int) bool { return s[i].name < s[j].name }
// readdirnames is a hacked-apart version of the Go stdlib code, exposing inode
// numbers further up the stack when reading directory contents. Unlike
// os.Readdirnames, which returns a list of filenames, this function returns a
// list of {filename,inode} pairs, sorted by name.
func readdirnames(dirname string) (names []nameIno, err error) {
	var (
		size = 100                // initial capacity hint for the result slice
		buf  = make([]byte, 4096) // scratch buffer for raw dirent records
		nbuf int                  // number of valid bytes currently in buf
		bufp int                  // current parse position within buf
		nb   int                  // bytes consumed by the latest parse pass
	)

	f, err := os.Open(dirname)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	names = make([]nameIno, 0, size) // Empty with room to grow.
	for {
		// Refill the buffer if necessary
		if bufp >= nbuf {
			bufp = 0
			nbuf, err = unix.ReadDirent(int(f.Fd()), buf) // getdents on linux
			if nbuf < 0 {
				nbuf = 0
			}
			if err != nil {
				return nil, os.NewSyscallError("readdirent", err)
			}
			if nbuf <= 0 {
				break // EOF
			}
		}

		// Drain the buffer
		nb, names = parseDirent(buf[bufp:nbuf], names)
		bufp += nb
	}

	sl := nameInoSlice(names)
	sort.Sort(sl)
	return sl, nil
}
// parseDirent is a minor modification of unix.ParseDirent (linux version)
// which returns {name,inode} pairs instead of just names. It consumes
// whole dirent records from buf, appending to names, and reports how
// many bytes of buf it consumed.
func parseDirent(buf []byte, names []nameIno) (consumed int, newnames []nameIno) {
	origlen := len(buf)
	for len(buf) > 0 {
		dirent := (*unix.Dirent)(unsafe.Pointer(&buf[0])) // #nosec G103 -- Ignore "G103: Use of unsafe calls should be audited"
		buf = buf[dirent.Reclen:]
		if dirent.Ino == 0 { // File absent in directory.
			continue
		}
		// Extract the NUL-terminated name from the fixed-size field.
		b := (*[10000]byte)(unsafe.Pointer(&dirent.Name[0])) // #nosec G103 -- Ignore "G103: Use of unsafe calls should be audited"
		name := string(b[0:clen(b[:])])
		if name == "." || name == ".." { // Useless names
			continue
		}
		names = append(names, nameIno{name, dirent.Ino})
	}
	return origlen - len(buf), names
}
// clen returns the length of the NUL-terminated byte string in n, i.e.
// the index of the first zero byte, or len(n) when no terminator is
// present. Uses bytes.IndexByte instead of a hand-rolled scan.
func clen(n []byte) int {
	if i := bytes.IndexByte(n, 0); i >= 0 {
		return i
	}
	return len(n)
}
//go:build !windows
package archive
import (
"io/fs"
"os"
"syscall"
)
// statDifferent reports whether two stat results differ in a way that
// should be recorded as a modification: mode, ownership or device number
// always count; size and mtime count only for non-directories.
func statDifferent(oldStat fs.FileInfo, newStat fs.FileInfo) bool {
	oldSys := oldStat.Sys().(*syscall.Stat_t)
	newSys := newStat.Sys().(*syscall.Stat_t)

	if oldStat.Mode() != newStat.Mode() {
		return true
	}
	if oldSys.Uid != newSys.Uid || oldSys.Gid != newSys.Gid {
		return true
	}
	if oldSys.Rdev != newSys.Rdev {
		return true
	}
	// Don't look at size or modification time for dirs, its not a good
	// measure of change. See https://github.com/moby/moby/issues/9874
	// for a description of the issue with modification time, and
	// https://github.com/moby/moby/pull/11422 for the change.
	// (Note that in the Windows implementation of this function,
	// modification time IS taken as a change). See
	// https://github.com/moby/moby/pull/37982 for more information.
	if oldStat.Mode().IsDir() {
		return false
	}
	return !sameFsTime(oldStat.ModTime(), newStat.ModTime()) || oldStat.Size() != newStat.Size()
}
// isDir reports whether this node is a directory; the parentless root
// node is always treated as one.
func (info *FileInfo) isDir() bool {
	return info.parent == nil || info.stat.Mode().IsDir()
}
// getIno returns the inode number from fi's underlying syscall.Stat_t.
func getIno(fi os.FileInfo) uint64 {
	return fi.Sys().(*syscall.Stat_t).Ino
}

// hasHardlinks reports whether fi has more than one directory entry
// (hard link count > 1).
func hasHardlinks(fi os.FileInfo) bool {
	return fi.Sys().(*syscall.Stat_t).Nlink > 1
}
package compression
import (
"bufio"
"bytes"
"compress/bzip2"
"compress/gzip"
"context"
"errors"
"fmt"
"io"
"os"
"os/exec"
"strconv"
"sync"
"github.com/containerd/log"
"github.com/klauspost/compress/zstd"
)
// Compression identifies the compression algorithm applied to a stream
// (or None for an uncompressed stream).
type Compression int

const (
	None  Compression = 0 // no compression
	Bzip2 Compression = 1 // bzip2 compression algorithm
	Gzip  Compression = 2 // gzip compression algorithm
	Xz    Compression = 3 // xz compression algorithm
	Zstd  Compression = 4 // zstd compression algorithm
)
// Extension returns the conventional file extension ("tar", "tar.gz",
// ...) for an archive that uses the specified compression algorithm, or
// "" for an unknown value.
func (c *Compression) Extension() string {
	switch *c {
	case Bzip2:
		return "tar.bz2"
	case Gzip:
		return "tar.gz"
	case None:
		return "tar"
	case Xz:
		return "tar.xz"
	case Zstd:
		return "tar.zst"
	}
	return ""
}
// readCloserWrapper pairs an io.Reader with an optional close callback,
// used to release decompressor resources when the stream is done.
type readCloserWrapper struct {
	io.Reader
	closer func() error
}

// Close invokes the wrapper's close callback, if any.
func (r *readCloserWrapper) Close() error {
	if r.closer != nil {
		return r.closer()
	}
	return nil
}
// nopWriteCloser adds a no-op Close method to an io.Writer.
type nopWriteCloser struct {
	io.Writer
}

// Close implements io.Closer; it does nothing.
func (nopWriteCloser) Close() error { return nil }
var bufioReader32KPool = &sync.Pool{
New: func() interface{} { return bufio.NewReaderSize(nil, 32*1024) },
}
type bufferedReader struct {
buf *bufio.Reader
}
func newBufferedReader(r io.Reader) *bufferedReader {
buf := bufioReader32KPool.Get().(*bufio.Reader)
buf.Reset(r)
return &bufferedReader{buf}
}
func (r *bufferedReader) Read(p []byte) (int, error) {
if r.buf == nil {
return 0, io.EOF
}
n, err := r.buf.Read(p)
if errors.Is(err, io.EOF) {
r.buf.Reset(nil)
bufioReader32KPool.Put(r.buf)
r.buf = nil
}
return n, err
}
func (r *bufferedReader) Peek(n int) ([]byte, error) {
if r.buf == nil {
return nil, io.EOF
}
return r.buf.Peek(n)
}
// DecompressStream decompresses the archive and returns a ReaderCloser with the decompressed archive.
// The format is sniffed from the stream's leading bytes; an unrecognized
// signature is passed through as an uncompressed stream.
func DecompressStream(archive io.Reader) (io.ReadCloser, error) {
	buf := newBufferedReader(archive)
	bs, err := buf.Peek(10)
	if err != nil && !errors.Is(err, io.EOF) {
		// Note: we'll ignore any io.EOF error because there are some odd
		// cases where the layer.tar file will be empty (zero bytes) and
		// that results in an io.EOF from the Peek() call. So, in those
		// cases we'll just treat it as a non-compressed stream and
		// that means just create an empty layer.
		// See Issue 18170
		return nil, err
	}

	switch compression := Detect(bs); compression {
	case None:
		return &readCloserWrapper{
			Reader: buf,
		}, nil
	case Gzip:
		// The context allows Close to tear down a possible external
		// unpigz process spawned by gzipDecompress.
		ctx, cancel := context.WithCancel(context.Background())

		gzReader, err := gzipDecompress(ctx, buf)
		if err != nil {
			cancel()
			return nil, err
		}
		return &readCloserWrapper{
			Reader: gzReader,
			closer: func() error {
				cancel()
				return gzReader.Close()
			},
		}, nil
	case Bzip2:
		bz2Reader := bzip2.NewReader(buf)
		return &readCloserWrapper{
			Reader: bz2Reader,
		}, nil
	case Xz:
		// The context allows Close to terminate the external xz process.
		ctx, cancel := context.WithCancel(context.Background())

		xzReader, err := xzDecompress(ctx, buf)
		if err != nil {
			cancel()
			return nil, err
		}
		return &readCloserWrapper{
			Reader: xzReader,
			closer: func() error {
				cancel()
				return xzReader.Close()
			},
		}, nil
	case Zstd:
		zstdReader, err := zstd.NewReader(buf)
		if err != nil {
			return nil, err
		}
		return &readCloserWrapper{
			Reader: zstdReader,
			closer: func() error {
				// zstd.Decoder.Close has no error return.
				zstdReader.Close()
				return nil
			},
		}, nil
	default:
		return nil, fmt.Errorf("unsupported compression format (%d)", compression)
	}
}
// CompressStream returns a WriteCloser that compresses data written to
// it into dest using the specified algorithm. Only None and Gzip are
// supported for writing; the other recognized formats return a
// descriptive error. Previously Zstd fell through to the default branch
// and produced the cryptic "unsupported compression format (4)"; it now
// gets an explicit message consistent with Bzip2 and Xz.
func CompressStream(dest io.Writer, compression Compression) (io.WriteCloser, error) {
	switch compression {
	case None:
		return nopWriteCloser{dest}, nil
	case Gzip:
		return gzip.NewWriter(dest), nil
	case Bzip2:
		// The standard library's compress/bzip2 only supports reading.
		return nil, errors.New("unsupported compression format: tar.bz2")
	case Xz:
		// there is no xz support at all
		// However, this is not a problem as docker only currently generates gzipped tars
		return nil, errors.New("unsupported compression format: tar.xz")
	case Zstd:
		// Zstd decompression is supported, but compression is not implemented.
		return nil, errors.New("unsupported compression format: tar.zst")
	default:
		return nil, fmt.Errorf("unsupported compression format (%d)", compression)
	}
}
// xzDecompress pipes archive through the external xz binary and returns
// its decompressed stdout as a stream. Cancelling ctx kills the process.
func xzDecompress(ctx context.Context, archive io.Reader) (io.ReadCloser, error) {
	cmd := exec.CommandContext(ctx, "xz", "-d", "-c", "-q")
	return cmdStream(cmd, archive)
}
// gzipDecompress returns a stream that decompresses buf as gzip. It
// prefers the external unpigz binary (parallel gzip) when present,
// unless disabled via the MOBY_DISABLE_PIGZ environment variable, and
// otherwise falls back to the standard library gzip reader.
func gzipDecompress(ctx context.Context, buf io.Reader) (io.ReadCloser, error) {
	if noPigzEnv := os.Getenv("MOBY_DISABLE_PIGZ"); noPigzEnv != "" {
		disabled, err := strconv.ParseBool(noPigzEnv)
		if err != nil {
			log.G(ctx).WithError(err).Warn("invalid value in MOBY_DISABLE_PIGZ env var")
		}
		if disabled {
			log.G(ctx).Debugf("Use of pigz is disabled due to MOBY_DISABLE_PIGZ=%s", noPigzEnv)
			return gzip.NewReader(buf)
		}
	}

	unpigzPath, err := exec.LookPath("unpigz")
	if err != nil {
		log.G(ctx).Debugf("unpigz binary not found, falling back to go gzip library")
		return gzip.NewReader(buf)
	}

	log.G(ctx).Debugf("Using %s to decompress", unpigzPath)
	return cmdStream(exec.CommandContext(ctx, unpigzPath, "-d", "-c"), buf)
}
// cmdStream executes a command, and returns its stdout as a stream.
// If the command fails to run or doesn't complete successfully, an error
// will be returned, including anything written on stderr. Closing the
// returned stream waits for the command to exit.
func cmdStream(cmd *exec.Cmd, in io.Reader) (io.ReadCloser, error) {
	reader, writer := io.Pipe()

	cmd.Stdin = in
	cmd.Stdout = writer

	// Capture stderr so a failure can be reported with context.
	var errBuf bytes.Buffer
	cmd.Stderr = &errBuf

	// Run the command and return the pipe
	if err := cmd.Start(); err != nil {
		return nil, err
	}

	// Ensure the command has exited before we clean anything up
	done := make(chan struct{})

	// Copy stdout to the returned pipe
	go func() {
		if err := cmd.Wait(); err != nil {
			// Propagate the failure (with captured stderr) to readers.
			_ = writer.CloseWithError(fmt.Errorf("%w: %s", err, errBuf.String()))
		} else {
			_ = writer.Close()
		}
		close(done)
	}()

	return &readCloserWrapper{
		Reader: reader,
		closer: func() error {
			// Close pipeR, and then wait for the command to complete before returning. We have to close pipeR first, as
			// cmd.Wait waits for any non-file stdout/stderr/stdin to close.
			err := reader.Close()
			<-done
			return err
		},
	}, nil
}
package compression
import (
"bytes"
"encoding/binary"
)
// Zstandard skippable-frame magic numbers: any little-endian value in the
// range 0x184D2A50..0x184D2A5F marks a skippable frame, hence start + mask.
const (
	zstdMagicSkippableStart = 0x184D2A50
	zstdMagicSkippableMask  = 0xFFFFFFF0
)

// Magic-number prefixes used to sniff the compression format of a stream.
var (
	bzip2Magic = []byte{0x42, 0x5A, 0x68}
	gzipMagic  = []byte{0x1F, 0x8B, 0x08}
	xzMagic    = []byte{0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00}
	zstdMagic  = []byte{0x28, 0xb5, 0x2f, 0xfd}
)

// matcher reports whether the given source bytes are in a given format.
type matcher = func([]byte) bool
// Detect detects the compression algorithm of the source.
// It returns None when no known magic number matches.
func Detect(source []byte) Compression {
	// Check each known magic number in a fixed order. The previous version
	// built a map per call and then walked a parallel slice for ordering;
	// a single ordered slice gives the same result without the extra
	// allocation and duplicated structure.
	for _, candidate := range []struct {
		compression Compression
		matches     matcher
	}{
		{Bzip2, magicNumberMatcher(bzip2Magic)},
		{Gzip, magicNumberMatcher(gzipMagic)},
		{Xz, magicNumberMatcher(xzMagic)},
		{Zstd, zstdMatcher()},
	} {
		if candidate.matches(source) {
			return candidate.compression
		}
	}
	return None
}
// magicNumberMatcher returns a matcher that reports whether the source
// starts with the magic-number prefix m.
func magicNumberMatcher(m []byte) matcher {
	return func(source []byte) bool {
		if len(source) < len(m) {
			return false
		}
		return bytes.Equal(source[:len(m)], m)
	}
}
// zstdMatcher detects the zstd compression algorithm.
// Zstandard compressed data is made of one or more frames. There are two
// frame formats defined by Zstandard: Zstandard frames and Skippable frames.
// See https://datatracker.ietf.org/doc/html/rfc8878#section-3 for more details.
func zstdMatcher() matcher {
	return func(source []byte) bool {
		switch {
		case bytes.HasPrefix(source, zstdMagic):
			// Zstandard frame.
			return true
		case len(source) < 8:
			// Too short to carry a skippable-frame header.
			return false
		default:
			// Skippable frame: little-endian magic from 0x184D2A50 to 0x184D2A5F.
			magic := binary.LittleEndian.Uint32(source[:4])
			return magic&zstdMagicSkippableMask == zstdMagicSkippableStart
		}
	}
}
package archive
import (
"archive/tar"
"context"
"errors"
"io"
"os"
"path/filepath"
"strings"
"sync"
"github.com/containerd/log"
)
// Errors used or returned by this file.
var (
	// ErrNotDirectory is returned when the destination's parent path exists
	// but is not a directory.
	ErrNotDirectory = errors.New("not a directory")
	// ErrDirNotExists is returned when the destination is asserted to be a
	// directory (trailing "/" or "/.") but cannot be created as one from the
	// source content.
	ErrDirNotExists = errors.New("no such directory")
	// ErrCannotCopyDir is returned when copying a directory over an existing
	// non-directory file.
	ErrCannotCopyDir = errors.New("cannot copy directory")
	// ErrInvalidCopySource indicates unusable copy source content.
	ErrInvalidCopySource = errors.New("invalid copy source content")
)
var copyPool = sync.Pool{
New: func() interface{} { s := make([]byte, 32*1024); return &s },
}
func copyWithBuffer(dst io.Writer, src io.Reader) error {
buf := copyPool.Get().(*[]byte)
_, err := io.CopyBuffer(dst, src, *buf)
copyPool.Put(buf)
return err
}
// PreserveTrailingDotOrSeparator returns the given cleaned path (after
// processing using any utility functions from the path or filepath stdlib
// packages) and appends a trailing `/.` or `/` if its corresponding original
// path (from before being processed by utility functions from the path or
// filepath stdlib packages) ends with a trailing `/.` or `/`. If the cleaned
// path already ends in a `.` path segment, then another is not added. If the
// clean path already ends in a path separator, then another is not added.
func PreserveTrailingDotOrSeparator(cleanedPath string, originalPath string) string {
	// Ensure paths are in platform semantics
	cleanedPath = normalizePath(cleanedPath)
	originalPath = normalizePath(originalPath)

	if !specifiesCurrentDir(cleanedPath) && specifiesCurrentDir(originalPath) {
		if !hasTrailingPathSeparator(cleanedPath) {
			// Add a separator if it doesn't already end with one (a cleaned
			// path would only end in a separator if it is the root).
			cleanedPath += string(filepath.Separator)
		}
		cleanedPath += "."
	}

	// Re-attach a trailing separator that cleaning stripped off.
	if !hasTrailingPathSeparator(cleanedPath) && hasTrailingPathSeparator(originalPath) {
		cleanedPath += string(filepath.Separator)
	}

	return cleanedPath
}
// assertsDirectory reports whether the given path is asserted to be a
// directory, i.e. it ends with a trailing '/' or `/.` (assuming a path
// separator of `/`).
func assertsDirectory(path string) bool {
	return hasTrailingPathSeparator(path) || specifiesCurrentDir(path)
}

// hasTrailingPathSeparator reports whether the given path ends with the
// system's path separator character. An empty path has no separator.
func hasTrailingPathSeparator(path string) bool {
	if path == "" {
		return false
	}
	return path[len(path)-1] == filepath.Separator
}

// specifiesCurrentDir reports whether the given path specifies a "current
// directory", i.e. its last path segment is `.`.
func specifiesCurrentDir(path string) bool {
	return filepath.Base(path) == "."
}
// SplitPathDirEntry splits the given path between its directory name and its
// basename by first cleaning the path but preserves a trailing "." if the
// original path specified the current directory.
func SplitPathDirEntry(path string) (dir, base string) {
	cleaned := filepath.Clean(filepath.FromSlash(path))
	if specifiesCurrentDir(path) {
		// Re-attach the "." segment that Clean stripped off.
		cleaned += string(os.PathSeparator) + "."
	}
	dir, base = filepath.Dir(cleaned), filepath.Base(cleaned)
	return dir, base
}
// TarResource archives the resource described by the given CopyInfo to a Tar
// archive. A non-nil error is returned if sourcePath does not exist or is
// asserted to be a directory but exists as another type of file.
//
// This function acts as a convenient wrapper around TarWithOptions, which
// requires a directory as the source path. TarResource accepts either a
// directory or a file path and correctly sets the Tar options.
func TarResource(sourceInfo CopyInfo) (content io.ReadCloser, err error) {
	// Delegate to TarResourceRebase with the (possibly empty) rebase name
	// recorded when the source path was resolved.
	return TarResourceRebase(sourceInfo.Path, sourceInfo.RebaseName)
}
// TarResourceRebase is like TarResource but renames the first path element of
// items in the resulting tar archive to match the given rebaseName if not "".
func TarResourceRebase(sourcePath, rebaseName string) (content io.ReadCloser, _ error) {
	sourcePath = normalizePath(sourcePath)
	if _, err := os.Lstat(sourcePath); err != nil {
		// Catches the case where the source does not exist or is not a
		// directory if asserted to be a directory, as this also causes an
		// error.
		return nil, err
	}

	// Separate the source path between its directory and
	// the entry in that directory which we are archiving.
	sourceDir, sourceBase := SplitPathDirEntry(sourcePath)
	opts := TarResourceRebaseOpts(sourceBase, rebaseName)

	log.G(context.TODO()).Debugf("copying %q from %q", sourceBase, sourceDir)
	return TarWithOptions(sourceDir, opts)
}
// TarResourceRebaseOpts does not perform the Tar, but instead just creates
// the rebase parameters to be sent to TarWithOptions (the TarOptions struct).
func TarResourceRebaseOpts(sourceBase string, rebaseName string) *TarOptions {
	filter := []string{sourceBase}
	return &TarOptions{
		IncludeFiles:     filter,
		IncludeSourceDir: true,
		// Rename the first path element sourceBase to rebaseName (an empty
		// rebaseName means no renaming).
		RebaseNames: map[string]string{
			sourceBase: rebaseName,
		},
	}
}
// CopyInfo holds basic info about the source
// or destination path of a copy operation.
type CopyInfo struct {
	Path   string // resolved path on the local filesystem
	Exists bool   // whether the path exists
	IsDir  bool   // whether the path is a directory
	// RebaseName, when non-empty, is the basename archive entries should be
	// renamed to (set when symlink resolution changed the last path element).
	RebaseName string
}
// CopyInfoSourcePath stats the given path to create a CopyInfo
// struct representing that resource for the source of an archive copy
// operation. The given path should be an absolute local path. A source path
// has all symlinks evaluated that appear before the last path separator ("/"
// on Unix). As it is to be a copy source, the path must exist.
func CopyInfoSourcePath(path string, followLink bool) (CopyInfo, error) {
	// Normalize the path, then resolve symlinks in its directory portion
	// (and in the final element too when followLink is set).
	resolvedPath, rebaseName, err := ResolveHostSourcePath(normalizePath(path), followLink)
	if err != nil {
		return CopyInfo{}, err
	}

	fi, err := os.Lstat(resolvedPath)
	if err != nil {
		return CopyInfo{}, err
	}

	return CopyInfo{
		Path:       resolvedPath,
		Exists:     true,
		IsDir:      fi.IsDir(),
		RebaseName: rebaseName,
	}, nil
}
// CopyInfoDestinationPath stats the given path to create a CopyInfo
// struct representing that resource for the destination of an archive copy
// operation. The given path should be an absolute local path.
//
// Symlinks in the final path element are followed manually (up to a small
// limit), because a broken trailing symlink is still a valid destination —
// the copy will create its target.
func CopyInfoDestinationPath(path string) (info CopyInfo, err error) {
	maxSymlinkIter := 10 // filepath.EvalSymlinks uses 255, but 10 already seems like a lot.
	path = normalizePath(path)
	originalPath := path

	stat, err := os.Lstat(path)

	if err == nil && stat.Mode()&os.ModeSymlink == 0 {
		// The path exists and is not a symlink.
		return CopyInfo{
			Path:   path,
			Exists: true,
			IsDir:  stat.IsDir(),
		}, nil
	}

	// While the path is a symlink.
	for n := 0; err == nil && stat.Mode()&os.ModeSymlink != 0; n++ {
		if n > maxSymlinkIter {
			// Don't follow symlinks more than this arbitrary number of times.
			return CopyInfo{}, errors.New("too many symlinks in " + originalPath)
		}

		// The path is a symbolic link. We need to evaluate it so that the
		// destination of the copy operation is the link target and not the
		// link itself. This is notably different than CopyInfoSourcePath which
		// only evaluates symlinks before the last appearing path separator.
		// Also note that it is okay if the last path element is a broken
		// symlink as the copy operation should create the target.
		var linkTarget string

		linkTarget, err = os.Readlink(path)
		if err != nil {
			return CopyInfo{}, err
		}

		if !filepath.IsAbs(linkTarget) {
			// Join with the parent directory.
			dstParent, _ := SplitPathDirEntry(path)
			linkTarget = filepath.Join(dstParent, linkTarget)
		}

		path = linkTarget
		stat, err = os.Lstat(path)
	}

	if err != nil {
		// It's okay if the destination path doesn't exist. We can still
		// continue the copy operation if the parent directory exists.
		if !os.IsNotExist(err) {
			return CopyInfo{}, err
		}

		// Ensure destination parent dir exists.
		dstParent, _ := SplitPathDirEntry(path)

		parentDirStat, err := os.Stat(dstParent)
		if err != nil {
			return CopyInfo{}, err
		}
		if !parentDirStat.IsDir() {
			return CopyInfo{}, ErrNotDirectory
		}

		// Destination does not exist but its parent does: Exists stays false.
		return CopyInfo{Path: path}, nil
	}

	// The path exists after resolving symlinks.
	return CopyInfo{
		Path:   path,
		Exists: true,
		IsDir:  stat.IsDir(),
	}, nil
}
// PrepareArchiveCopy prepares the given srcContent archive, which should
// contain the archived resource described by srcInfo, to the destination
// described by dstInfo. Returns the possibly modified content archive along
// with the path to the destination directory which it should be extracted to.
func PrepareArchiveCopy(srcContent io.Reader, srcInfo, dstInfo CopyInfo) (dstDir string, content io.ReadCloser, err error) {
	// Ensure in platform semantics
	srcInfo.Path = normalizePath(srcInfo.Path)
	dstInfo.Path = normalizePath(dstInfo.Path)

	// Separate the destination path between its directory and base
	// components in case the source archive contents need to be rebased.
	dstDir, dstBase := SplitPathDirEntry(dstInfo.Path)
	_, srcBase := SplitPathDirEntry(srcInfo.Path)

	// A rebase name recorded during source resolution overrides the basename
	// derived from the source path (it only matters in the rebasing cases
	// below).
	if srcInfo.RebaseName != "" {
		srcBase = srcInfo.RebaseName
	}

	switch {
	case dstInfo.Exists && dstInfo.IsDir:
		// The destination exists as a directory. No alteration
		// to srcContent is needed as its contents can be
		// simply extracted to the destination directory.
		return dstInfo.Path, io.NopCloser(srcContent), nil
	case dstInfo.Exists && srcInfo.IsDir:
		// The destination exists as some type of file and the source
		// content is a directory. This is an error condition since
		// you cannot copy a directory to an existing file location.
		return "", nil, ErrCannotCopyDir
	case dstInfo.Exists:
		// The destination exists as some type of file and the source content
		// is also a file: rename the source entry to the destination path's
		// basename.
		return dstDir, RebaseArchiveEntries(srcContent, srcBase, dstBase), nil
	case srcInfo.IsDir:
		// The destination does not exist and the source content is an archive
		// of a directory: extract into the parent of the destination path and
		// rename the created directory to the destination path's basename.
		return dstDir, RebaseArchiveEntries(srcContent, srcBase, dstBase), nil
	case assertsDirectory(dstInfo.Path):
		// The destination does not exist and is asserted to be created as a
		// directory, but the source content is not a directory. This is an
		// error condition since you cannot create a directory from a file
		// source.
		return "", nil, ErrDirNotExists
	default:
		// The destination does not exist, is not asserted to be a directory,
		// and the source content is a file: the destination file is created
		// on extraction, with the entry renamed to the destination basename.
		return dstDir, RebaseArchiveEntries(srcContent, srcBase, dstBase), nil
	}
}
// RebaseArchiveEntries rewrites the given srcContent archive replacing
// an occurrence of oldBase with newBase at the beginning of entry names.
//
// The rewrite happens in a goroutine streaming through an io.Pipe, so errors
// from the source archive are delivered to the reader via CloseWithError.
func RebaseArchiveEntries(srcContent io.Reader, oldBase, newBase string) io.ReadCloser {
	if oldBase == string(os.PathSeparator) {
		// If oldBase specifies the root directory, use an empty string as
		// oldBase instead so that newBase doesn't replace the path separator
		// that all paths will start with.
		oldBase = ""
	}

	rebased, w := io.Pipe()

	go func() {
		srcTar := tar.NewReader(srcContent)
		rebasedTar := tar.NewWriter(w)

		for {
			hdr, err := srcTar.Next()
			if errors.Is(err, io.EOF) {
				// Signals end of archive.
				rebasedTar.Close()
				w.Close()
				return
			}
			if err != nil {
				w.CloseWithError(err)
				return
			}

			// srcContent tar stream, as served by TarWithOptions(), is
			// definitely in PAX format, but tar.Next() mistakenly guesses it
			// as USTAR, which creates a problem: if the newBase is >100
			// characters long, WriteHeader() returns an error like
			// "archive/tar: cannot encode header: Format specifies USTAR; and USTAR cannot encode Name=...".
			//
			// To fix, set the format to PAX here. See docker/for-linux issue #484.
			hdr.Format = tar.FormatPAX
			hdr.Name = strings.Replace(hdr.Name, oldBase, newBase, 1)
			if hdr.Typeflag == tar.TypeLink {
				// Hard-link targets are entry names too, so rebase them as well.
				hdr.Linkname = strings.Replace(hdr.Linkname, oldBase, newBase, 1)
			}

			if err = rebasedTar.WriteHeader(hdr); err != nil {
				w.CloseWithError(err)
				return
			}

			// Ignoring GoSec G110. See https://github.com/securego/gosec/pull/433
			// and https://cure53.de/pentest-report_opa.pdf, which recommends to
			// replace io.Copy with io.CopyN7. The latter allows to specify the
			// maximum number of bytes that should be read. By properly defining
			// the limit, it can be assured that a GZip compression bomb cannot
			// easily cause a Denial-of-Service.
			// After reviewing with @tonistiigi and @cpuguy83, this should not
			// affect us, because here we do not read into memory, hence should
			// not be vulnerable to this code consuming memory.
			//nolint:gosec // G110: Potential DoS vulnerability via decompression bomb (gosec)
			if _, err = io.Copy(rebasedTar, srcTar); err != nil {
				w.CloseWithError(err)
				return
			}
		}
	}()

	return rebased
}
// CopyResource performs an archive copy from the given source path to the
// given destination path. The source path MUST exist and the destination
// path's parent directory must exist.
func CopyResource(srcPath, dstPath string, followLink bool) error {
var (
srcInfo CopyInfo
err error
)
// Ensure in platform semantics
srcPath = normalizePath(srcPath)
dstPath = normalizePath(dstPath)
// Clean the source and destination paths.
srcPath = PreserveTrailingDotOrSeparator(filepath.Clean(srcPath), srcPath)
dstPath = PreserveTrailingDotOrSeparator(filepath.Clean(dstPath), dstPath)
if srcInfo, err = CopyInfoSourcePath(srcPath, followLink); err != nil {
return err
}
content, err := TarResource(srcInfo)
if err != nil {
return err
}
defer content.Close()
return CopyTo(content, srcInfo, dstPath)
}
// CopyTo handles extracting the given content whose
// entries should be sourced from srcInfo to dstPath.
func CopyTo(content io.Reader, srcInfo CopyInfo, dstPath string) error {
	// The destination path need not exist, but CopyInfoDestinationPath will
	// ensure that at least the parent directory exists.
	dstInfo, err := CopyInfoDestinationPath(normalizePath(dstPath))
	if err != nil {
		return err
	}

	dstDir, copyArchive, err := PrepareArchiveCopy(content, srcInfo, dstInfo)
	if err != nil {
		return err
	}
	defer copyArchive.Close()

	return Untar(copyArchive, dstDir, &TarOptions{
		NoLchown:             true,
		NoOverwriteDirNonDir: true,
	})
}
// ResolveHostSourcePath decides real path need to be copied with parameters such as
// whether to follow symbol link or not, if followLink is true, resolvedPath will return
// link target of any symbol link file, else it will only resolve symlink of directory
// but return symbol link file itself without resolving.
func ResolveHostSourcePath(path string, followLink bool) (resolvedPath, rebaseName string, _ error) {
	if followLink {
		var err error
		// Resolve every symlink in the path, including the final element.
		resolvedPath, err = filepath.EvalSymlinks(path)
		if err != nil {
			return "", "", err
		}

		resolvedPath, rebaseName = GetRebaseName(path, resolvedPath)
	} else {
		dirPath, basePath := filepath.Split(path)

		// if not follow symbol link, then resolve symbol link of parent dir
		resolvedDirPath, err := filepath.EvalSymlinks(dirPath)
		if err != nil {
			return "", "", err
		}
		// resolvedDirPath will have been cleaned (no trailing path separators) so
		// we can manually join it with the base path element.
		resolvedPath = resolvedDirPath + string(filepath.Separator) + basePath
		if hasTrailingPathSeparator(path) &&
			filepath.Base(path) != filepath.Base(resolvedPath) {
			// The basename changed during resolution: record the originally
			// requested name so archive entries can be rebased to it.
			rebaseName = filepath.Base(path)
		}
	}
	return resolvedPath, rebaseName, nil
}
// GetRebaseName normalizes and compares path and resolvedPath,
// return completed resolved path and rebased file name
func GetRebaseName(path, resolvedPath string) (string, string) {
	// resolvedPath has been cleaned (no trailing separator or "."), so carry
	// over any directory assertion from the original path.
	if specifiesCurrentDir(path) && !specifiesCurrentDir(resolvedPath) {
		resolvedPath += string(filepath.Separator) + "."
	}
	if hasTrailingPathSeparator(path) && !hasTrailingPathSeparator(resolvedPath) {
		resolvedPath += string(filepath.Separator)
	}

	var rebaseName string
	if filepath.Base(path) != filepath.Base(resolvedPath) {
		// Symlink evaluation changed the last path component: entries in the
		// copied archive must be rebased to the originally requested name.
		rebaseName = filepath.Base(path)
	}
	return resolvedPath, rebaseName
}
//go:build !windows
package archive
import (
"path/filepath"
)
// normalizePath converts the given path to platform semantics. On
// non-Windows platforms that means slash-separated paths, so this is
// effectively a no-op for paths already using forward slashes.
func normalizePath(path string) string {
	return filepath.ToSlash(path)
}
//go:build !windows && !freebsd
package archive
import "golang.org/x/sys/unix"
// mknod creates a filesystem node at path with the given mode, passing dev
// through to the mknod(2) syscall (used for device special files).
func mknod(path string, mode uint32, dev uint64) error {
	return unix.Mknod(path, mode, int(dev))
}
package archive
import (
"archive/tar"
"context"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"runtime"
"strings"
"github.com/containerd/log"
"github.com/moby/go-archive/compression"
)
// UnpackLayer unpack `layer` to a `dest`. The stream `layer` can be
// compressed or uncompressed.
// Returns the size in bytes of the contents of the layer.
//
// AUFS-style whiteout entries (".wh.*") in the layer are interpreted as
// deletions of the corresponding paths in dest, and hardlinks into the AUFS
// metadata directory are retargeted through files extracted into a
// temporary directory.
func UnpackLayer(dest string, layer io.Reader, options *TarOptions) (size int64, err error) {
	tr := tar.NewReader(layer)

	var dirs []*tar.Header
	// Paths created by this layer; consulted when applying an opaque-dir
	// whiteout so pre-existing files are removed but freshly unpacked ones kept.
	unpackedPaths := make(map[string]struct{})

	if options == nil {
		options = &TarOptions{}
	}
	if options.ExcludePatterns == nil {
		options.ExcludePatterns = []string{}
	}

	aufsTempdir := ""
	aufsHardlinks := make(map[string]*tar.Header)

	// Iterate through the files in the archive.
	for {
		hdr, err := tr.Next()
		if errors.Is(err, io.EOF) {
			// end of tar archive
			break
		}
		if err != nil {
			return 0, err
		}

		size += hdr.Size

		// Normalize name, for safety and for a simple is-root check
		hdr.Name = filepath.Clean(hdr.Name)

		// Windows does not support filenames with colons in them. Ignore
		// these files. This is not a problem though (although it might
		// appear that it is). Let's suppose a client is running docker pull.
		// The daemon it points to is Windows. Would it make sense for the
		// client to be doing a docker pull Ubuntu for example (which has files
		// with colons in the name under /usr/share/man/man3)? No, absolutely
		// not as it would really only make sense that they were pulling a
		// Windows image. However, for development, it is necessary to be able
		// to pull Linux images which are in the repository.
		//
		// TODO Windows. Once the registry is aware of what images are Windows-
		// specific or Linux-specific, this warning should be changed to an error
		// to cater for the situation where someone does manage to upload a Linux
		// image but have it tagged as Windows inadvertently.
		if runtime.GOOS == "windows" {
			if strings.Contains(hdr.Name, ":") {
				log.G(context.TODO()).Warnf("Windows: Ignoring %s (is this a Linux image?)", hdr.Name)
				continue
			}
		}

		// Ensure that the parent directory exists.
		err = createImpliedDirectories(dest, hdr, options)
		if err != nil {
			return 0, err
		}

		// Skip AUFS metadata dirs
		if strings.HasPrefix(hdr.Name, WhiteoutMetaPrefix) {
			// Regular files inside /.wh..wh.plnk can be used as hardlink targets
			// We don't want this directory, but we need the files in them so that
			// such hardlinks can be resolved.
			if strings.HasPrefix(hdr.Name, WhiteoutLinkDir) && hdr.Typeflag == tar.TypeReg {
				basename := filepath.Base(hdr.Name)
				aufsHardlinks[basename] = hdr
				if aufsTempdir == "" {
					if aufsTempdir, err = os.MkdirTemp(dest, "dockerplnk"); err != nil {
						return 0, err
					}
					defer os.RemoveAll(aufsTempdir)
				}
				if err := createTarFile(filepath.Join(aufsTempdir, basename), dest, hdr, tr, options); err != nil {
					return 0, err
				}
			}

			// Opaque-dir markers still need processing below; everything else
			// under the metadata prefix is skipped.
			if hdr.Name != WhiteoutOpaqueDir {
				continue
			}
		}

		// #nosec G305 -- The joined path is guarded against path traversal.
		path := filepath.Join(dest, hdr.Name)
		rel, err := filepath.Rel(dest, path)
		if err != nil {
			return 0, err
		}

		// Note as these operations are platform specific, so must the slash be.
		if strings.HasPrefix(rel, ".."+string(os.PathSeparator)) {
			return 0, breakoutError(fmt.Errorf("%q is outside of %q", hdr.Name, dest))
		}
		base := filepath.Base(path)

		if strings.HasPrefix(base, WhiteoutPrefix) {
			dir := filepath.Dir(path)
			if base == WhiteoutOpaqueDir {
				_, err := os.Lstat(dir)
				if err != nil {
					return 0, err
				}
				// Opaque dir: remove everything under dir that this layer did
				// not itself unpack.
				err = filepath.WalkDir(dir, func(path string, info os.DirEntry, err error) error {
					if err != nil {
						if os.IsNotExist(err) {
							err = nil // parent was deleted
						}
						return err
					}
					if path == dir {
						return nil
					}
					if _, exists := unpackedPaths[path]; !exists {
						return os.RemoveAll(path)
					}
					return nil
				})
				if err != nil {
					return 0, err
				}
			} else {
				// Plain whiteout: delete the file or directory it shadows.
				originalBase := base[len(WhiteoutPrefix):]
				originalPath := filepath.Join(dir, originalBase)
				if err := os.RemoveAll(originalPath); err != nil {
					return 0, err
				}
			}
		} else {
			// If path exits we almost always just want to remove and replace it.
			// The only exception is when it is a directory *and* the file from
			// the layer is also a directory. Then we want to merge them (i.e.
			// just apply the metadata from the layer).
			if fi, err := os.Lstat(path); err == nil {
				if !fi.IsDir() || hdr.Typeflag != tar.TypeDir {
					if err := os.RemoveAll(path); err != nil {
						return 0, err
					}
				}
			}

			srcData := io.Reader(tr)
			srcHdr := hdr

			// Hard links into /.wh..wh.plnk don't work, as we don't extract that directory, so
			// we manually retarget these into the temporary files we extracted them into
			if hdr.Typeflag == tar.TypeLink && strings.HasPrefix(filepath.Clean(hdr.Linkname), WhiteoutLinkDir) {
				linkBasename := filepath.Base(hdr.Linkname)
				srcHdr = aufsHardlinks[linkBasename]
				if srcHdr == nil {
					return 0, errors.New("invalid aufs hardlink")
				}
				tmpFile, err := os.Open(filepath.Join(aufsTempdir, linkBasename))
				if err != nil {
					return 0, err
				}
				defer tmpFile.Close()
				srcData = tmpFile
			}

			if err := remapIDs(options.IDMap, srcHdr); err != nil {
				return 0, err
			}

			if err := createTarFile(path, dest, srcHdr, srcData, options); err != nil {
				return 0, err
			}

			// Directory mtimes must be handled at the end to avoid further
			// file creation in them to modify the directory mtime
			if hdr.Typeflag == tar.TypeDir {
				dirs = append(dirs, hdr)
			}
			unpackedPaths[path] = struct{}{}
		}
	}

	// Apply directory timestamps last so that extracting children above did
	// not clobber them.
	for _, hdr := range dirs {
		// #nosec G305 -- The header was checked for path traversal before it was appended to the dirs slice.
		path := filepath.Join(dest, hdr.Name)
		if err := chtimes(path, hdr.AccessTime, hdr.ModTime); err != nil {
			return 0, err
		}
	}
	return size, nil
}
// ApplyLayer parses a diff in the standard layer format from `layer`,
// and applies it to the directory `dest`. The stream `layer` can be
// compressed or uncompressed.
// Returns the size in bytes of the contents of the layer.
func ApplyLayer(dest string, layer io.Reader) (int64, error) {
	// Decompression is enabled (final true argument); default options.
	return applyLayerHandler(dest, layer, &TarOptions{}, true)
}
// ApplyUncompressedLayer parses a diff in the standard layer format from
// `layer`, and applies it to the directory `dest`. The stream `layer`
// can only be uncompressed.
// Returns the size in bytes of the contents of the layer.
func ApplyUncompressedLayer(dest string, layer io.Reader, options *TarOptions) (int64, error) {
	// Decompression disabled (final false argument); caller supplies options.
	return applyLayerHandler(dest, layer, options, false)
}
// IsEmpty checks if the tar archive is empty (doesn't contain any entries).
func IsEmpty(rd io.Reader) (bool, error) {
	decompRd, err := compression.DecompressStream(rd)
	if err != nil {
		return true, fmt.Errorf("failed to decompress archive: %w", err)
	}
	defer decompRd.Close()

	// An archive is empty exactly when reading the first header yields EOF.
	_, err = tar.NewReader(decompRd).Next()
	switch {
	case err == nil:
		return false, nil
	case errors.Is(err, io.EOF):
		return true, nil
	default:
		return false, fmt.Errorf("failed to read next archive header: %w", err)
	}
}
// do the bulk load of ApplyLayer, but allow for not calling DecompressStream
func applyLayerHandler(dest string, layer io.Reader, options *TarOptions, decompress bool) (int64, error) {
	dest = filepath.Clean(dest)

	// We need to be able to set any perms
	restore := overrideUmask(0)
	defer restore()

	if decompress {
		decompLayer, err := compression.DecompressStream(layer)
		if err != nil {
			return 0, err
		}
		defer decompLayer.Close()
		layer = decompLayer
	}
	return UnpackLayer(dest, layer, options)
}
//go:build !windows
package archive
import "golang.org/x/sys/unix"
// overrideUmask sets current process's file mode creation mask to newmask
// and returns a function to restore it.
//
// WARNING for readers stumbling upon this code. Changing umask in a multi-
// threaded environment isn't safe. Don't use this without understanding the
// risks, and don't export this function for others to use (we shouldn't even
// be using this ourself).
//
// FIXME(thaJeztah): we should get rid of these hacks if possible.
func overrideUmask(newMask int) func() {
	oldMask := unix.Umask(newMask)
	return func() {
		// Restore the mask captured at call time.
		unix.Umask(oldMask)
	}
}
package archive
// CheckSystemDriveAndRemoveDriveLetter verifies that a path, if it includes a
// drive letter, is the system drive.
//
// On Linux: this is a no-op.
//
// On Windows: this verifies and manipulates a Windows path. This is used, for
// example, when validating a user provided path in docker cp. If a drive
// letter is supplied, it must be the system drive. The drive letter is always
// removed. Also, it translates it to OS semantics (IOW / to \). We need the
// path in this syntax so that it can ultimately be concatenated with a
// Windows long-path which doesn't support drive-letters. Examples:
//
//	C:   --> Fail
//	C:\  --> \
//	a    --> a
//	/a   --> \a
//	d:\  --> Fail
func CheckSystemDriveAndRemoveDriveLetter(path string) (string, error) {
	return checkSystemDriveAndRemoveDriveLetter(path)
}
//go:build !windows
package archive
// checkSystemDriveAndRemoveDriveLetter is the non-Windows implementation
// of CheckSystemDriveAndRemoveDriveLetter. Non-Windows paths have no drive
// letters, so the path is returned unmodified.
func checkSystemDriveAndRemoveDriveLetter(path string) (string, error) {
	return path, nil
}
package archive
import (
"syscall"
"time"
"unsafe"
)
var (
minTime = time.Unix(0, 0)
maxTime time.Time
)
func init() {
if unsafe.Sizeof(syscall.Timespec{}.Nsec) == 8 {
// This is a 64 bit timespec
// os.Chtimes limits time to the following
maxTime = time.Unix(0, 1<<63-1)
} else {
// This is a 32 bit timespec
maxTime = time.Unix(1<<31-1, 0)
}
}
func boundTime(t time.Time) time.Time {
if t.Before(minTime) || t.After(maxTime) {
return minTime
}
return t
}
func latestTime(t1, t2 time.Time) time.Time {
if t1.Before(t2) {
return t2
}
return t1
}
//go:build !windows
package archive
import (
"os"
"time"
"golang.org/x/sys/unix"
)
// chtimes changes the access time and modified time of a file at the given
// path via os.Chtimes.
//
// NOTE(review): the previous comment claimed out-of-range times default to
// the Unix epoch, but no bounding happens here — os.Chtimes has undefined
// behavior for times a timespec cannot represent. Callers appear expected to
// pass pre-bounded values (see boundTime); confirm against call sites.
func chtimes(name string, atime time.Time, mtime time.Time) error {
	return os.Chtimes(name, atime, mtime)
}
// timeToTimespec converts t to a unix.Timespec. A zero time maps to the
// special UTIME_OMIT value so the corresponding timestamp is left untouched.
// (The original parameter was named `time`, shadowing the time package.)
func timeToTimespec(t time.Time) unix.Timespec {
	if t.IsZero() {
		// Return UTIME_OMIT special value
		return unix.Timespec{
			Sec:  0,
			Nsec: (1 << 30) - 2,
		}
	}
	return unix.NsecToTimespec(t.UnixNano())
}
// lchtimes changes the access and modification times of name without
// following symlinks (AT_SYMLINK_NOFOLLOW). On platforms where setting
// symlink times is unsupported (ENOSYS), the error is deliberately swallowed
// so extraction can proceed best-effort.
func lchtimes(name string, atime time.Time, mtime time.Time) error {
	utimes := [2]unix.Timespec{
		timeToTimespec(atime),
		timeToTimespec(mtime),
	}
	err := unix.UtimesNanoAt(unix.AT_FDCWD, name, utimes[0:], unix.AT_SYMLINK_NOFOLLOW)
	if err != nil && err != unix.ENOSYS {
		return err
	}
	// err is nil or ENOSYS here. Previously `return err` leaked ENOSYS to
	// the caller, making the guard above dead code; return nil so the
	// unsupported case is actually ignored as intended.
	return nil
}
package archive
import (
"archive/tar"
"bytes"
"io"
)
// Generate generates a new archive from the content provided
// as input.
//
// `files` is a sequence of path/content pairs. A new file is
// added to the archive for each pair.
// If the last pair is incomplete, the file is created with an
// empty content. For example:
//
// Generate("foo.txt", "hello world", "emptyfile")
//
// The above call will return an archive with 2 files:
// - ./foo.txt with content "hello world"
// - ./empty with empty content
//
// FIXME: stream content instead of buffering
// FIXME: specify permissions and other archive metadata
func Generate(input ...string) (io.Reader, error) {
files := parseStringPairs(input...)
buf := new(bytes.Buffer)
tw := tar.NewWriter(buf)
for _, file := range files {
name, content := file[0], file[1]
hdr := &tar.Header{
Name: name,
Size: int64(len(content)),
}
if err := tw.WriteHeader(hdr); err != nil {
return nil, err
}
if _, err := tw.Write([]byte(content)); err != nil {
return nil, err
}
}
if err := tw.Close(); err != nil {
return nil, err
}
return buf, nil
}
func parseStringPairs(input ...string) [][2]string {
output := make([][2]string, 0, len(input)/2+1)
for i := 0; i < len(input); i += 2 {
var pair [2]string
pair[0] = input[i]
if i+1 < len(input) {
pair[1] = input[i+1]
}
output = append(output, pair)
}
return output
}
//go:build linux || darwin || freebsd || netbsd
package archive
import (
"errors"
"fmt"
"io/fs"
"golang.org/x/sys/unix"
)
// lgetxattr retrieves the value of the extended attribute identified by attr
// and associated with the given path in the file system.
// It returns a nil slice and nil error if the xattr is not set.
func lgetxattr(path string, attr string) ([]byte, error) {
	// Start with a 128 length byte array
	dest := make([]byte, 128)
	sz, err := unix.Lgetxattr(path, attr, dest)

	for errors.Is(err, unix.ERANGE) {
		// Buffer too small, use zero-sized buffer to get the actual size
		sz, err = unix.Lgetxattr(path, attr, []byte{})
		if err != nil {
			return nil, wrapPathError("lgetxattr", path, attr, err)
		}
		dest = make([]byte, sz)
		// The value may have grown again between the size query and this
		// read; if so, the loop retries with the newly reported size.
		sz, err = unix.Lgetxattr(path, attr, dest)
	}

	if err != nil {
		if errors.Is(err, noattr) {
			// Attribute not set: by contract this is not an error.
			return nil, nil
		}
		return nil, wrapPathError("lgetxattr", path, attr, err)
	}

	return dest[:sz], nil
}
// lsetxattr sets the value of the extended attribute identified by attr
// and associated with the given path in the file system.
// Any error is wrapped in an fs.PathError that names the attribute.
func lsetxattr(path string, attr string, data []byte, flags int) error {
	return wrapPathError("lsetxattr", path, attr, unix.Lsetxattr(path, attr, data, flags))
}
func wrapPathError(op, path, attr string, err error) error {
if err == nil {
return nil
}
return &fs.PathError{Op: op, Path: path, Err: fmt.Errorf("xattr %q: %w", attr, err)}
}
package build
import (
"context"
"fmt"
"strconv"
"github.com/distribution/reference"
"github.com/docker/docker/daemon/builder"
daemonevents "github.com/docker/docker/daemon/events"
buildkit "github.com/docker/docker/daemon/internal/builder-next"
"github.com/docker/docker/image"
"github.com/docker/docker/pkg/stringid"
"github.com/moby/moby/api/types/backend"
"github.com/moby/moby/api/types/build"
"github.com/moby/moby/api/types/events"
"github.com/pkg/errors"
"google.golang.org/grpc"
)
// ImageComponent provides an interface for working with images
type ImageComponent interface {
	// SquashImage flattens an image's layers into a single layer,
	// returning the resulting image ID.
	SquashImage(from string, to string) (string, error)
	// TagImage applies the given reference to the image with the given ID.
	TagImage(context.Context, image.ID, reference.Named) error
}
// Builder defines interface for running a build
type Builder interface {
	// Build runs a build described by the config and returns its result.
	Build(context.Context, backend.BuildConfig) (*builder.Result, error)
}
// Backend provides build functionality to the API router
type Backend struct {
	// builder is the classic (Dockerfile) builder.
	builder Builder
	// imageComponent performs image operations (squash, tag).
	imageComponent ImageComponent
	// buildkit is the BuildKit-backed builder, used when the client
	// requests build.BuilderBuildKit.
	buildkit *buildkit.Builder
	// eventsService records builder events (e.g. cache pruning).
	eventsService *daemonevents.Events
}
// NewBackend creates a new build backend from components
func NewBackend(components ImageComponent, builder Builder, buildkit *buildkit.Builder, es *daemonevents.Events) (*Backend, error) {
	b := &Backend{
		imageComponent: components,
		builder:        builder,
		buildkit:       buildkit,
		eventsService:  es,
	}
	return b, nil
}
// RegisterGRPC registers the buildkit controller with the gRPC server.
// It is a no-op when buildkit is not configured.
func (b *Backend) RegisterGRPC(s *grpc.Server) {
	if b.buildkit == nil {
		return
	}
	b.buildkit.RegisterGRPC(s)
}
// Build builds an image from a Source
func (b *Backend) Build(ctx context.Context, config backend.BuildConfig) (string, error) {
	options := config.Options
	useBuildKit := options.Version == build.BuilderBuildKit
	// Validate and de-duplicate the requested tags before doing any work.
	tags, err := sanitizeRepoAndTags(options.Tags)
	if err != nil {
		return "", err
	}
	// Dispatch to BuildKit or the classic builder based on the requested
	// builder version.
	var buildResult *builder.Result
	if useBuildKit {
		buildResult, err = b.buildkit.Build(ctx, config)
		if err != nil {
			return "", err
		}
	} else {
		buildResult, err = b.builder.Build(ctx, config)
		if err != nil {
			return "", err
		}
	}
	// A nil result with no error yields an empty image ID and no tagging.
	if buildResult == nil {
		return "", nil
	}
	imageID := buildResult.ImageID
	if options.Squash {
		// Squashing replaces the image ID; re-emit it on the aux stream so
		// clients see the final (squashed) ID.
		if imageID, err = squashBuild(buildResult, b.imageComponent); err != nil {
			return "", err
		}
		if config.ProgressWriter.AuxFormatter != nil {
			if err = config.ProgressWriter.AuxFormatter.Emit("moby.image.id", build.Result{ID: imageID}); err != nil {
				return "", err
			}
		}
	}
	// The "Successfully built" message and tagging here apply only to the
	// classic builder; BuildKit reports and tags through its own pipeline.
	if imageID != "" && !useBuildKit {
		stdout := config.ProgressWriter.StdoutFormatter
		_, _ = fmt.Fprintf(stdout, "Successfully built %s\n", stringid.TruncateID(imageID))
		err = tagImages(ctx, b.imageComponent, config.ProgressWriter.StdoutFormatter, image.ID(imageID), tags)
	}
	return imageID, err
}
// PruneCache removes all cached build sources
func (b *Backend) PruneCache(ctx context.Context, opts build.CachePruneOptions) (*build.CachePruneReport, error) {
	reclaimed, cacheIDs, err := b.buildkit.Prune(ctx, opts)
	if err != nil {
		return nil, errors.Wrap(err, "failed to prune build cache")
	}
	// Record a prune event noting how many bytes were reclaimed.
	actor := events.Actor{
		Attributes: map[string]string{
			"reclaimed": strconv.FormatInt(reclaimed, 10),
		},
	}
	b.eventsService.Log(events.ActionPrune, events.BuilderEventType, actor)
	return &build.CachePruneReport{SpaceReclaimed: uint64(reclaimed), CachesDeleted: cacheIDs}, nil
}
// Cancel cancels the build by ID
func (b *Backend) Cancel(ctx context.Context, id string) error {
	// NOTE(review): cancellation is delegated to buildkit only — presumably
	// classic-builder builds are cancelled through their request context;
	// confirm against the API router.
	return b.buildkit.Cancel(ctx, id)
}
// squashBuild flattens the layers of the built image on top of its base
// image and returns the ID of the squashed image.
func squashBuild(build *builder.Result, imageComponent ImageComponent) (string, error) {
	fromID := ""
	if img := build.FromImage; img != nil {
		fromID = img.ImageID()
	}
	squashedID, err := imageComponent.SquashImage(build.ImageID, fromID)
	if err != nil {
		return "", errors.Wrap(err, "error squashing image")
	}
	return squashedID, nil
}
// Copyright 2022 Google LLC. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package build
import (
fuzz "github.com/AdaLogics/go-fuzz-headers"
)
// FuzzsanitizeRepoAndTags feeds fuzzer-generated name lists into
// sanitizeRepoAndTags to exercise its parsing and validation paths.
func FuzzsanitizeRepoAndTags(data []byte) int {
	consumer := fuzz.NewConsumer(data)
	var names []string
	if err := consumer.CreateSlice(&names); err != nil {
		return 0
	}
	_, _ = sanitizeRepoAndTags(names)
	return 1
}
package build
import (
"context"
"fmt"
"io"
"github.com/distribution/reference"
"github.com/docker/docker/image"
"github.com/pkg/errors"
)
// tagImages creates image tags for the imageID.
// It stops at the first tagging failure; tags applied so far are kept.
func tagImages(ctx context.Context, ic ImageComponent, stdout io.Writer, imageID image.ID, repoAndTags []reference.Named) error {
	for _, ref := range repoAndTags {
		err := ic.TagImage(ctx, imageID, ref)
		if err != nil {
			return err
		}
		_, _ = fmt.Fprintln(stdout, "Successfully tagged", reference.FamiliarString(ref))
	}
	return nil
}
// sanitizeRepoAndTags parses the raw "t" parameter received from the client
// to a slice of repoAndTag. It removes duplicates, and validates each name
// to not contain a digest.
func sanitizeRepoAndTags(names []string) ([]reference.Named, error) {
	var repoAndTags []reference.Named
	seen := map[string]struct{}{}
	for _, name := range names {
		if name == "" {
			continue
		}
		ref, err := reference.ParseNormalizedNamed(name)
		if err != nil {
			return nil, err
		}
		// A digest pins one exact image, which makes no sense as the name
		// of an image being built.
		if _, isDigested := ref.(reference.Digested); isDigested {
			return nil, errors.New("build tag cannot contain a digest")
		}
		// Normalize to name:tag so duplicates compare equal.
		ref = reference.TagNameOnly(ref)
		key := ref.String()
		if _, dup := seen[key]; dup {
			continue
		}
		seen[key] = struct{}{}
		repoAndTags = append(repoAndTags, ref)
	}
	return repoAndTags, nil
}
package dockerfile
import (
"fmt"
"io"
"sort"
)
// builtinAllowedBuildArgs is list of built-in allowed build args
// these args are considered transparent and are excluded from the image history.
// Filtering from history is implemented in dispatchers.go
//
// Both upper- and lower-case spellings of each proxy variable are accepted.
var builtinAllowedBuildArgs = map[string]bool{
	"HTTP_PROXY":  true,
	"http_proxy":  true,
	"HTTPS_PROXY": true,
	"https_proxy": true,
	"FTP_PROXY":   true,
	"ftp_proxy":   true,
	"NO_PROXY":    true,
	"no_proxy":    true,
	"ALL_PROXY":   true,
	"all_proxy":   true,
}
// BuildArgs manages arguments used by the builder. It tracks which args are
// allowed for substitution, which were declared before the first FROM, which
// the Dockerfile references, and which the user supplied via the API. Values
// are stored as *string so an ARG without a default (nil) can be told apart
// from one defaulting to the empty string.
type BuildArgs struct {
	// args that are allowed for expansion/substitution and passing to commands in 'run'.
	allowedBuildArgs map[string]*string
	// args defined before the first `FROM` in a Dockerfile
	allowedMetaArgs map[string]*string
	// args referenced by the Dockerfile
	referencedArgs map[string]struct{}
	// args provided by the user on the command line
	argsFromOptions map[string]*string
}
// NewBuildArgs creates a new BuildArgs type
func NewBuildArgs(argsFromOptions map[string]*string) *BuildArgs {
	args := &BuildArgs{
		allowedBuildArgs: map[string]*string{},
		allowedMetaArgs:  map[string]*string{},
		referencedArgs:   map[string]struct{}{},
		argsFromOptions:  argsFromOptions,
	}
	return args
}
// Clone returns a copy of the BuildArgs type. The internal maps are copied;
// the argsFromOptions map is shared with the original.
func (b *BuildArgs) Clone() *BuildArgs {
	clone := NewBuildArgs(b.argsFromOptions)
	for name, value := range b.allowedBuildArgs {
		clone.allowedBuildArgs[name] = value
	}
	for name, value := range b.allowedMetaArgs {
		clone.allowedMetaArgs[name] = value
	}
	for name := range b.referencedArgs {
		clone.referencedArgs[name] = struct{}{}
	}
	return clone
}
// MergeReferencedArgs merges referenced args from another BuildArgs
// object into the current one
func (b *BuildArgs) MergeReferencedArgs(other *BuildArgs) {
	for name := range other.referencedArgs {
		b.referencedArgs[name] = struct{}{}
	}
}
// WarnOnUnusedBuildArgs checks if there are any leftover build-args that were
// passed but not consumed during build. Print a warning, if there are any.
func (b *BuildArgs) WarnOnUnusedBuildArgs(out io.Writer) {
	var unused []string
	for arg := range b.argsFromOptions {
		// Built-in proxy args and args the Dockerfile referenced count
		// as consumed.
		if _, ok := builtinAllowedBuildArgs[arg]; ok {
			continue
		}
		if _, ok := b.referencedArgs[arg]; ok {
			continue
		}
		unused = append(unused, arg)
	}
	if len(unused) == 0 {
		return
	}
	// Sort for a deterministic warning message.
	sort.Strings(unused)
	fmt.Fprintf(out, "[Warning] One or more build-args %v were not consumed\n", unused)
}
// ResetAllowed clears the list of args that are allowed to be used by a
// directive. Meta args and referenced args are preserved.
func (b *BuildArgs) ResetAllowed() {
	b.allowedBuildArgs = make(map[string]*string)
}
// AddMetaArg adds a new meta arg that can be used by FROM directives.
// A nil value means the arg was declared without a default.
func (b *BuildArgs) AddMetaArg(key string, value *string) {
	b.allowedMetaArgs[key] = value
}
// AddArg adds a new arg that can be used by directives, and records the arg
// as referenced (so it is not reported as unused by WarnOnUnusedBuildArgs).
func (b *BuildArgs) AddArg(key string, value *string) {
	b.allowedBuildArgs[key] = value
	b.referencedArgs[key] = struct{}{}
}
// IsReferencedOrNotBuiltin checks if the key is a built-in arg, or if it has been
// referenced by the Dockerfile. Returns true if the arg is not a builtin or
// if the builtin has been referenced in the Dockerfile.
func (b *BuildArgs) IsReferencedOrNotBuiltin(key string) bool {
	if _, isAllowed := b.allowedBuildArgs[key]; isAllowed {
		return true
	}
	_, isBuiltin := builtinAllowedBuildArgs[key]
	return !isBuiltin
}
// GetAllAllowed returns a mapping with all the allowed args, resolved
// against command-line overrides and meta args (see getBuildArg).
func (b *BuildArgs) GetAllAllowed() map[string]string {
	return b.getAllFromMapping(b.allowedBuildArgs)
}
// GetAllMeta returns a mapping with all the meta args, resolved against
// command-line overrides (see getBuildArg).
func (b *BuildArgs) GetAllMeta() map[string]string {
	return b.getAllFromMapping(b.allowedMetaArgs)
}
// getAllFromMapping resolves every key from the given mapping (plus the
// built-in proxy args) to its effective value, omitting unset args.
func (b *BuildArgs) getAllFromMapping(source map[string]*string) map[string]string {
	resolved := make(map[string]string)
	for _, key := range keysFromMaps(source, builtinAllowedBuildArgs) {
		if value, ok := b.getBuildArg(key, source); ok {
			resolved[key] = value
		}
	}
	return resolved
}
// FilterAllowed returns all allowed args without the filtered args
func (b *BuildArgs) FilterAllowed(filter []string) []string {
	excluded := convertKVStringsToMap(filter)
	envs := []string{}
	for key, val := range b.GetAllAllowed() {
		if _, skip := excluded[key]; skip {
			continue
		}
		envs = append(envs, key+"="+val)
	}
	return envs
}
// getBuildArg resolves the effective value of key. Precedence:
//  1. a non-nil command-line override (argsFromOptions) — applied even for
//     keys absent from mapping,
//  2. a non-nil default declared in mapping,
//  3. a non-nil meta-ARG value (declared before the first FROM).
// The boolean reports whether a value was found.
func (b *BuildArgs) getBuildArg(key string, mapping map[string]*string) (string, bool) {
	defaultValue, exists := mapping[key]
	// Return override from options if one is defined
	if v, ok := b.argsFromOptions[key]; ok && v != nil {
		return *v, ok
	}
	if defaultValue == nil {
		// No (or nil) default in mapping; fall back to a meta-ARG value.
		if v, ok := b.allowedMetaArgs[key]; ok && v != nil {
			return *v, ok
		}
		return "", false
	}
	return *defaultValue, exists
}
// keysFromMaps returns the keys of the source and builtin maps, concatenated
// into one slice. Keys present in both maps appear twice; ordering within
// each group is unspecified (map iteration order).
func keysFromMaps(source map[string]*string, builtin map[string]bool) []string {
	// Pre-size to the exact total to avoid repeated slice growth.
	keys := make([]string, 0, len(source)+len(builtin))
	for key := range source {
		keys = append(keys, key)
	}
	for key := range builtin {
		keys = append(keys, key)
	}
	return keys
}
package dockerfile
import (
"bytes"
"context"
"fmt"
"io"
"sort"
"strings"
"github.com/containerd/log"
"github.com/containerd/platforms"
"github.com/docker/docker/daemon/builder"
"github.com/docker/docker/daemon/builder/remotecontext"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/pkg/stringid"
"github.com/moby/buildkit/frontend/dockerfile/instructions"
"github.com/moby/buildkit/frontend/dockerfile/parser"
"github.com/moby/buildkit/frontend/dockerfile/shell"
"github.com/moby/moby/api/types/backend"
"github.com/moby/moby/api/types/build"
"github.com/moby/moby/api/types/container"
"github.com/moby/sys/user"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
"golang.org/x/sync/syncmap"
)
// validCommitCommands is the set of Dockerfile instructions (lower-cased)
// accepted in the "changes" list handled by BuildFromConfig.
var validCommitCommands = map[string]bool{
	"cmd":         true,
	"entrypoint":  true,
	"healthcheck": true,
	"env":         true,
	"expose":      true,
	"label":       true,
	"onbuild":     true,
	"stopsignal":  true,
	"user":        true,
	"volume":      true,
	"workdir":     true,
}
const (
	// stepFormat is the progress-line template producing
	// "Step <current>/<total> : <command>" (see printCommand).
	stepFormat = "Step %d/%d : %v"
)
// BuildManager is shared across all Builder objects
type BuildManager struct {
	// idMapping is the user-namespace identity mapping passed to builders.
	idMapping user.IdentityMapping
	// backend executes build steps on the daemon.
	backend builder.Backend
	// pathCache caches COPY/ADD source hashes across builds.
	pathCache pathCache // TODO: make this persistent
}
// NewBuildManager creates a BuildManager
func NewBuildManager(b builder.Backend, identityMapping user.IdentityMapping) (*BuildManager, error) {
	return &BuildManager{
		backend:   b,
		pathCache: &syncmap.Map{},
		idMapping: identityMapping,
	}, nil
}
// Build starts a new build from a BuildConfig
func (bm *BuildManager) Build(ctx context.Context, config backend.BuildConfig) (*builder.Result, error) {
	buildsTriggered.Inc()
	if config.Options.Dockerfile == "" {
		config.Options.Dockerfile = builder.DefaultDockerfileName
	}
	// Resolve the build context and locate the Dockerfile within it.
	source, dockerfile, err := remotecontext.Detect(config)
	if err != nil {
		return nil, err
	}
	// The detected context may be a temporary copy; remove it when done.
	defer func() {
		if source != nil {
			if err := source.Close(); err != nil {
				log.G(ctx).Debugf("[BUILDER] failed to remove temporary context: %v", err)
			}
		}
	}()
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	b, err := newBuilder(ctx, builderOptions{
		Options:        config.Options,
		ProgressWriter: config.ProgressWriter,
		Backend:        bm.backend,
		PathCache:      bm.pathCache,
		IDMapping:      bm.idMapping,
	})
	if err != nil {
		return nil, err
	}
	return b.build(ctx, source, dockerfile)
}
// builderOptions are the dependencies required by the builder
type builderOptions struct {
	// Options are the client-supplied build options (may be nil).
	Options *build.ImageBuildOptions
	// Backend executes build steps on the daemon.
	Backend builder.Backend
	// ProgressWriter carries the stdout/stderr/aux progress streams.
	ProgressWriter backend.ProgressWriter
	// PathCache caches COPY/ADD source hashes across builds.
	PathCache pathCache
	// IDMapping is the user-namespace identity mapping for the build.
	IDMapping user.IdentityMapping
}
// Builder is a Dockerfile builder
// It implements the builder.Backend interface.
type Builder struct {
	// options are the build options supplied by the API request.
	options *build.ImageBuildOptions
	// Stdout/Stderr/Aux/Output are the progress and log streams for the build.
	Stdout io.Writer
	Stderr io.Writer
	Aux    backend.AuxEmitter
	Output io.Writer
	// docker is the daemon backend used to run build steps.
	docker    builder.Backend
	idMapping user.IdentityMapping
	// disableCommit suppresses image commits (set by BuildFromConfig).
	disableCommit bool
	imageSources  *imageSources
	// pathCache caches COPY/ADD source hashes between builds.
	pathCache        pathCache
	containerManager *containerManager
	// imageProber is used for build-cache probing (see newImageProber).
	imageProber ImageProber
	// platform is the parsed target platform, or nil when unspecified.
	platform *ocispec.Platform
}
// newBuilder creates a new Dockerfile builder from an optional dockerfile and a Options.
func newBuilder(ctx context.Context, options builderOptions) (*Builder, error) {
	config := options.Options
	if config == nil {
		config = new(build.ImageBuildOptions)
	}
	// NOTE(review): the prober presumably consults the CacheFrom images for
	// cache hits, with NoCache disabling it — confirm in newImageProber.
	imgProber, err := newImageProber(ctx, options.Backend, config.CacheFrom, config.NoCache)
	if err != nil {
		return nil, err
	}
	b := &Builder{
		options:          config,
		Stdout:           options.ProgressWriter.StdoutFormatter,
		Stderr:           options.ProgressWriter.StderrFormatter,
		Aux:              options.ProgressWriter.AuxFormatter,
		Output:           options.ProgressWriter.Output,
		docker:           options.Backend,
		idMapping:        options.IDMapping,
		imageSources:     newImageSources(options),
		pathCache:        options.PathCache,
		imageProber:      imgProber,
		containerManager: newContainerManager(options.Backend),
	}
	// same as in Builder.Build in builder/builder-next/builder.go
	// TODO: remove once config.Platform is of type specs.Platform
	if config.Platform != "" {
		sp, err := platforms.Parse(config.Platform)
		if err != nil {
			return nil, errdefs.InvalidParameter(err)
		}
		b.platform = &sp
	}
	return b, nil
}
// Build 'LABEL' command(s) from '--label' options and add to the last stage
func buildLabelOptions(labels map[string]string, stages []instructions.Stage) {
	keys := make([]string, 0, len(labels))
	for key := range labels {
		keys = append(keys, key)
	}
	// Sort the label to have a repeatable order
	sort.Strings(keys)
	for _, key := range keys {
		stages[len(stages)-1].AddCommand(instructions.NewLabelCommand(key, labels[key], true))
	}
}
// Build runs the Dockerfile builder by parsing the Dockerfile and executing
// the instructions from the file.
func (b *Builder) build(ctx context.Context, source builder.Source, dockerfile *parser.Result) (*builder.Result, error) {
	defer b.imageSources.Unmount()
	stages, metaArgs, err := instructions.Parse(dockerfile.AST, nil)
	if err != nil {
		// Count unknown-instruction failures separately in the metrics.
		var uiErr *instructions.UnknownInstructionError
		if errors.As(err, &uiErr) {
			buildsFailed.WithValues(metricsUnknownInstructionError).Inc()
		}
		return nil, errdefs.InvalidParameter(err)
	}
	if b.options.Target != "" {
		// Truncate the stage list at the requested --target stage.
		targetIx, found := instructions.HasStage(stages, b.options.Target)
		if !found {
			buildsFailed.WithValues(metricsBuildTargetNotReachableError).Inc()
			return nil, errdefs.InvalidParameter(errors.Errorf("target stage %q could not be found", b.options.Target))
		}
		stages = stages[:targetIx+1]
	}
	// Add 'LABEL' command specified by '--label' option to the last stage
	buildLabelOptions(b.options.Labels, stages)
	dockerfile.PrintWarnings(b.Stderr)
	state, err := b.dispatchDockerfileWithCancellation(ctx, stages, metaArgs, dockerfile.EscapeToken, source)
	if err != nil {
		return nil, err
	}
	// An empty image ID after dispatch means no stage produced an image.
	if state.imageID == "" {
		buildsFailed.WithValues(metricsDockerfileEmptyError).Inc()
		return nil, errors.New("No image was generated. Is your Dockerfile empty?")
	}
	return &builder.Result{ImageID: state.imageID, FromImage: state.baseImage}, nil
}
// emitImageID sends the resulting image ID on the aux stream. It is a no-op
// when no emitter is configured or no image ID has been produced.
func emitImageID(aux backend.AuxEmitter, state *dispatchState) error {
	if aux == nil {
		return nil
	}
	if state.imageID == "" {
		return nil
	}
	return aux.Emit("", build.Result{ID: state.imageID})
}
// processMetaArg expands a pre-FROM ARG instruction against the args
// collected so far, then registers each resulting key/value both as a
// regular build arg and as a meta arg (usable in FROM).
func processMetaArg(meta instructions.ArgCommand, shlex *shell.Lex, args *BuildArgs) error {
	// shell.Lex currently only support the concatenated string format
	envs := shell.EnvsFromSlice(convertMapToEnvList(args.GetAllAllowed()))
	if err := meta.Expand(func(word string) (string, error) {
		newword, _, err := shlex.ProcessWord(word, envs)
		return newword, err
	}); err != nil {
		return err
	}
	for _, arg := range meta.Args {
		args.AddArg(arg.Key, arg.Value)
		args.AddMetaArg(arg.Key, arg.Value)
	}
	return nil
}
// printCommand writes one "Step x/y : cmd" progress line and returns the
// index of the next step.
func printCommand(out io.Writer, currentCommandIndex int, totalCommands int, cmd interface{}) int {
	_, _ = fmt.Fprintf(out, stepFormat+"\n", currentCommandIndex, totalCommands, cmd)
	return currentCommandIndex + 1
}
// dispatchDockerfileWithCancellation executes the parsed Dockerfile: first
// the meta ARGs (declared before the first FROM), then each stage's commands
// in order, checking for context cancellation before every command. It
// returns the dispatch state of the last stage.
func (b *Builder) dispatchDockerfileWithCancellation(ctx context.Context, parseResult []instructions.Stage, metaArgs []instructions.ArgCommand, escapeToken rune, source builder.Source) (*dispatchState, error) {
	request := dispatchRequest{}
	buildArgs := NewBuildArgs(b.options.BuildArgs)
	// Total step count for the "Step x/y" progress lines: one per meta ARG,
	// one per stage header, plus one per command.
	totalCommands := len(metaArgs) + len(parseResult)
	currentCommandIndex := 1
	for _, stage := range parseResult {
		totalCommands += len(stage.Commands)
	}
	shlex := shell.NewLex(escapeToken)
	for i := range metaArgs {
		currentCommandIndex = printCommand(b.Stdout, currentCommandIndex, totalCommands, &metaArgs[i])
		err := processMetaArg(metaArgs[i], shlex, buildArgs)
		if err != nil {
			return nil, err
		}
	}
	stagesResults := newStagesBuildResults()
	for _, s := range parseResult {
		stage := s
		if err := stagesResults.checkStageNameAvailable(stage.Name); err != nil {
			return nil, err
		}
		// Each stage gets a fresh dispatch request; completed stages are
		// shared through stagesResults (for COPY --from etc.).
		request = newDispatchRequest(b, escapeToken, source, buildArgs, stagesResults)
		currentCommandIndex = printCommand(b.Stdout, currentCommandIndex, totalCommands, stage.SourceCode)
		if err := initializeStage(ctx, request, &stage); err != nil {
			return nil, err
		}
		request.state.updateRunConfig()
		_, _ = fmt.Fprintf(b.Stdout, " ---> %s\n", stringid.TruncateID(request.state.imageID))
		for _, cmd := range stage.Commands {
			// Cancellation is polled between commands, not mid-command.
			select {
			case <-ctx.Done():
				log.G(ctx).Debug("Builder: build cancelled!")
				_, _ = fmt.Fprint(b.Stdout, "Build cancelled\n")
				buildsFailed.WithValues(metricsBuildCanceled).Inc()
				return nil, errors.New("Build cancelled")
			default:
				// Not cancelled yet, keep going...
			}
			currentCommandIndex = printCommand(b.Stdout, currentCommandIndex, totalCommands, cmd)
			if err := dispatch(ctx, request, cmd); err != nil {
				return nil, err
			}
			request.state.updateRunConfig()
			_, _ = fmt.Fprintf(b.Stdout, " ---> %s\n", stringid.TruncateID(request.state.imageID))
		}
		if err := emitImageID(b.Aux, request.state); err != nil {
			return nil, err
		}
		// Args referenced inside the stage count as consumed for the
		// unused-build-arg warning below.
		buildArgs.MergeReferencedArgs(request.state.buildArgs)
		if err := commitStage(request.state, stagesResults); err != nil {
			return nil, err
		}
	}
	buildArgs.WarnOnUnusedBuildArgs(b.Stdout)
	return request.state, nil
}
// BuildFromConfig builds directly from `changes`, treating it as if it were the contents of a Dockerfile
// It will:
// - Call parse.Parse() to get an AST root for the concatenated Dockerfile entries.
// - Do build by calling builder.dispatch() to call all entries' handling routines
//
// BuildFromConfig is used by the /commit endpoint, with the changes
// coming from the query parameter of the same name.
//
// TODO: Remove?
func BuildFromConfig(ctx context.Context, config *container.Config, changes []string, os string) (*container.Config, error) {
	if len(changes) == 0 {
		return config, nil
	}
	// Treat the changes as the lines of a small Dockerfile.
	dockerfile, err := parser.Parse(bytes.NewBufferString(strings.Join(changes, "\n")))
	if err != nil {
		return nil, errdefs.InvalidParameter(err)
	}
	// ensure that the commands are valid
	var commands []instructions.Command
	for _, n := range dockerfile.AST.Children {
		if !validCommitCommands[strings.ToLower(n.Value)] {
			return nil, errdefs.InvalidParameter(errors.Errorf("%s is not a valid change command", n.Value))
		}
		cmd, err := instructions.ParseCommand(n)
		if err != nil {
			return nil, errdefs.InvalidParameter(err)
		}
		commands = append(commands, cmd)
	}
	b, err := newBuilder(ctx, builderOptions{
		Options: &build.ImageBuildOptions{NoCache: true},
	})
	if err != nil {
		return nil, err
	}
	// Discard build output and skip committing intermediate images: only
	// the resulting run config is wanted here.
	b.Stdout = io.Discard
	b.Stderr = io.Discard
	b.disableCommit = true
	req := newDispatchRequest(b, dockerfile.EscapeToken, nil, NewBuildArgs(b.options.BuildArgs), newStagesBuildResults())
	// We make mutations to the configuration, ensure we have a copy
	req.state.runConfig = copyRunConfig(config)
	req.state.imageID = config.Image
	req.state.operatingSystem = os
	for _, cmd := range commands {
		err := dispatch(ctx, req, cmd)
		if err != nil {
			return nil, errdefs.InvalidParameter(err)
		}
		req.state.updateRunConfig()
	}
	return req.state.runConfig, nil
}
// convertMapToEnvList converts {"key":"value"} to ["key=value"]. The order
// of the result is unspecified (map iteration order). The result is always
// non-nil, matching the original behavior.
func convertMapToEnvList(m map[string]string) []string {
	// Pre-size to the exact length to avoid repeated slice growth.
	result := make([]string, 0, len(m))
	for k, v := range m {
		result = append(result, k+"="+v)
	}
	return result
}
// convertKVStringsToMap converts ["key=value"] to {"key":"value"}
func convertKVStringsToMap(values []string) map[string]string {
	result := make(map[string]string, len(values))
	for _, kv := range values {
		// Split on the first '='; entries without one map to "".
		parts := strings.SplitN(kv, "=", 2)
		if len(parts) == 1 {
			result[parts[0]] = ""
			continue
		}
		result[parts[0]] = parts[1]
	}
	return result
}
//go:build !windows
package dockerfile
// defaultShellForOS returns the default shell used to run commands on
// non-Windows platforms; the os argument is ignored here.
func defaultShellForOS(os string) []string {
	shell := []string{"/bin/sh", "-c"}
	return shell
}
package dockerfile
import (
"context"
"fmt"
"io"
cerrdefs "github.com/containerd/errdefs"
"github.com/containerd/log"
"github.com/docker/docker/daemon/builder"
"github.com/docker/docker/pkg/stringid"
"github.com/moby/moby/api/types/backend"
"github.com/moby/moby/api/types/container"
"github.com/pkg/errors"
)
// containerManager tracks the temporary containers created while running
// build steps so they can be removed when the build finishes.
type containerManager struct {
	// tmpContainers holds the IDs of containers created via Create and not
	// yet removed by RemoveAll.
	tmpContainers map[string]struct{}
	// backend creates, starts, attaches to, waits on and removes containers.
	backend builder.ExecBackend
}
// newContainerManager creates a new container backend
func newContainerManager(docker builder.ExecBackend) *containerManager {
	cm := &containerManager{
		backend:       docker,
		tmpContainers: map[string]struct{}{},
	}
	return cm
}
// Create a container
func (c *containerManager) Create(ctx context.Context, runConfig *container.Config, hostConfig *container.HostConfig) (container.CreateResponse, error) {
	createConfig := backend.ContainerCreateConfig{
		Config:     runConfig,
		HostConfig: hostConfig,
	}
	ctr, err := c.backend.ContainerCreateIgnoreImagesArgsEscaped(ctx, createConfig)
	if err != nil {
		return ctr, err
	}
	// Track the container so RemoveAll can clean it up later.
	c.tmpContainers[ctr.ID] = struct{}{}
	return ctr, nil
}
// errCancelled is returned by Run when the build context is cancelled while
// the container is executing.
var errCancelled = errors.New("build cancelled")
// Run a container by ID
func (c *containerManager) Run(ctx context.Context, cID string, stdout, stderr io.Writer) error {
	attached := make(chan struct{})
	errCh := make(chan error, 1)
	// Attach to the container's output streams in the background; attached
	// is signalled once the attach is in place, and errCh receives the
	// final attach error (or nil) when the streams close.
	go func() {
		errCh <- c.backend.ContainerAttachRaw(cID, nil, stdout, stderr, true, attached)
	}()
	select {
	case err := <-errCh:
		// Attach failed before it was established.
		return err
	case <-attached:
	}
	finished := make(chan struct{})
	cancelErrCh := make(chan error, 1)
	// Cancellation watcher: if ctx is cancelled before finished is closed,
	// force-remove the container and report errCancelled; otherwise report
	// nil. Exactly one value is always sent on cancelErrCh.
	go func() {
		select {
		case <-ctx.Done():
			log.G(ctx).Debugln("Build cancelled, removing container:", cID)
			err := c.backend.ContainerRm(cID, &backend.ContainerRmConfig{ForceRemove: true, RemoveVolume: true})
			if err != nil {
				_, _ = fmt.Fprintf(stdout, "Removing container %s: %v\n", stringid.TruncateID(cID), err)
			}
			cancelErrCh <- errCancelled
		case <-finished:
			cancelErrCh <- nil
		}
	}()
	// Every early-return path below closes finished and drains cancelErrCh
	// (via logCancellationError) so the watcher goroutine cannot leak.
	if err := c.backend.ContainerStart(ctx, cID, "", ""); err != nil {
		close(finished)
		logCancellationError(cancelErrCh, "error from ContainerStart: "+err.Error())
		return err
	}
	// Block on reading output from container, stop on err or chan closed
	if err := <-errCh; err != nil {
		close(finished)
		logCancellationError(cancelErrCh, "error from errCh: "+err.Error())
		return err
	}
	waitC, err := c.backend.ContainerWait(ctx, cID, container.WaitConditionNotRunning)
	if err != nil {
		close(finished)
		logCancellationError(cancelErrCh, fmt.Sprintf("unable to begin ContainerWait: %s", err))
		return err
	}
	// A non-zero exit status is surfaced as a statusCodeError so callers
	// can report the container's exit code.
	if status := <-waitC; status.ExitCode() != 0 {
		close(finished)
		logCancellationError(cancelErrCh, fmt.Sprintf("a non-zero code from ContainerWait: %d", status.ExitCode()))
		return &statusCodeError{code: status.ExitCode(), err: status.Err()}
	}
	close(finished)
	// Propagate errCancelled if the context was cancelled while running.
	return <-cancelErrCh
}
// logCancellationError drains the cancellation channel and, when the build
// was in fact cancelled, logs msg together with the cancellation error.
func logCancellationError(cancelErrCh chan error, msg string) {
	cancelErr := <-cancelErrCh
	if cancelErr == nil {
		return
	}
	log.G(context.TODO()).Debugf("Build cancelled (%v): %s", cancelErr, msg)
}
type statusCodeError struct {
code int
err error
}
func (e *statusCodeError) Error() string {
if e.err == nil {
return ""
}
return e.err.Error()
}
func (e *statusCodeError) StatusCode() int {
return e.code
}
// RemoveAll containers managed by this container manager
func (c *containerManager) RemoveAll(stdout io.Writer) {
	for id := range c.tmpContainers {
		rmConfig := &backend.ContainerRmConfig{ForceRemove: true, RemoveVolume: true}
		// An already-gone container still counts as removed.
		if err := c.backend.ContainerRm(id, rmConfig); err != nil && !cerrdefs.IsNotFound(err) {
			_, _ = fmt.Fprintf(stdout, "Removing intermediate container %s: %v\n", stringid.TruncateID(id), err)
			continue
		}
		delete(c.tmpContainers, id)
		_, _ = fmt.Fprintf(stdout, " ---> Removed intermediate container %s\n", stringid.TruncateID(id))
	}
}
package dockerfile
import (
"context"
"fmt"
"io"
"mime"
"net/http"
"net/url"
"os"
"path/filepath"
"sort"
"strings"
"time"
"github.com/containerd/log"
"github.com/docker/docker/daemon/builder"
"github.com/docker/docker/daemon/builder/remotecontext"
"github.com/docker/docker/daemon/builder/remotecontext/urlutil"
"github.com/docker/docker/pkg/longpath"
"github.com/docker/docker/pkg/progress"
"github.com/docker/docker/pkg/streamformatter"
"github.com/docker/docker/pkg/system"
"github.com/moby/buildkit/frontend/dockerfile/instructions"
"github.com/moby/go-archive"
"github.com/moby/sys/symlink"
"github.com/moby/sys/user"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// unnamedFilename is the fallback name used for a downloaded (ADD URL)
// source whose filename cannot be derived from the URL.
const unnamedFilename = "__unnamed__"
// pathCache caches source-path hashes, keyed by imageID+path (see
// calcCopyInfo/storeInPathCache). Satisfied by syncmap.Map.
type pathCache interface {
	Load(key interface{}) (value interface{}, ok bool)
	Store(key, value interface{})
}
// copyInfo is a data object which stores the metadata about each source file in
// a copyInstruction
type copyInfo struct {
	// root is the absolute directory the path is resolved against.
	root string
	// path is the source path, relative to root.
	path string
	// hash is the content hash of the source, used as a cache key.
	hash string
	// noDecompress disables archive extraction on ADD (set for downloads).
	noDecompress bool
}
// fullPath resolves the copy source to an absolute path, following symlinks
// while refusing to escape the source root.
func (c copyInfo) fullPath() (string, error) {
	return symlink.FollowSymlinkInScope(filepath.Join(c.root, c.path), c.root)
}
// newCopyInfoFromSource builds a copyInfo rooted at the given source.
func newCopyInfoFromSource(source builder.Source, path string, hash string) copyInfo {
	ci := copyInfo{}
	ci.root = source.Root()
	ci.path = path
	ci.hash = hash
	return ci
}
// newCopyInfos is a convenience helper turning a list of copyInfo values
// into a slice.
func newCopyInfos(copyInfos ...copyInfo) []copyInfo {
	return copyInfos
}
// copyInstruction is a fully parsed COPY or ADD command that is passed to
// Builder.performCopy to copy files into the image filesystem
type copyInstruction struct {
	// cmdName is "COPY" or "ADD", used in error messages.
	cmdName string
	// infos describes each resolved source file.
	infos []copyInfo
	// dest is the destination path in OS filepath form.
	dest string
	// chownStr carries the raw --chown value, if any — TODO confirm source.
	chownStr string
	// allowLocalDecompression permits extracting local archives (ADD).
	allowLocalDecompression bool
	// preserveOwnership keeps the source file ownership when copying.
	preserveOwnership bool
}
// copier reads a raw COPY or ADD command, fetches remote sources using a downloader,
// and creates a copyInstruction
type copier struct {
	// imageSource, when non-nil, is the image mount to copy from (--from).
	imageSource *imageMount
	// source is the build context (or the image layer's lazy context).
	source builder.Source
	// pathCache caches per-path hashes keyed by imageID+path.
	pathCache pathCache
	// download fetches a remote URL into a temporary source.
	download sourceDownloader
	// platform is the target platform of the current dispatch state.
	platform ocispec.Platform
	// for cleanup. TODO: having copier.cleanup() is error prone and hard to
	// follow. Code calling performCopy should manage the lifecycle of its params.
	// Copier should take override source as input, not imageMount.
	activeLayer builder.RWLayer
	tmpPaths    []string
}
// copierFromDispatchRequest assembles a copier for a COPY/ADD dispatch,
// wiring in the request's context source, path cache and target platform.
func copierFromDispatchRequest(req dispatchRequest, download sourceDownloader, imageSource *imageMount) copier {
	c := copier{
		source:      req.source,
		pathCache:   req.builder.pathCache,
		download:    download,
		imageSource: imageSource,
	}
	c.platform = req.builder.getPlatform(req.state)
	return c
}
// createCopyInstruction resolves a COPY/ADD sources-and-destination pair
// into a fully populated copyInstruction.
func (o *copier) createCopyInstruction(sourcesAndDest instructions.SourcesAndDest, cmdName string) (copyInstruction, error) {
	inst := copyInstruction{
		cmdName: cmdName,
		dest:    filepath.FromSlash(sourcesAndDest.DestPath),
	}
	infos, err := o.getCopyInfosForSourcePaths(sourcesAndDest.SourcePaths, inst.dest)
	if err != nil {
		return inst, errors.Wrapf(err, "%s failed", cmdName)
	}
	// Copying multiple sources onto a single file is ambiguous: require a
	// trailing separator so the destination is clearly a directory.
	if len(infos) > 1 && !strings.HasSuffix(inst.dest, string(os.PathSeparator)) {
		return inst, errors.Errorf("When using %s with more than one source file, the destination must be a directory and end with a /", cmdName)
	}
	inst.infos = infos
	return inst, nil
}
// getCopyInfosForSourcePaths iterates over the source files and calculate the info
// needed to copy (e.g. hash value if cached)
// The dest is used in case source is URL (and ends with "/")
func (o *copier) getCopyInfosForSourcePaths(sources []string, dest string) ([]copyInfo, error) {
	infos := make([]copyInfo, 0, len(sources))
	for _, src := range sources {
		perSource, err := o.getCopyInfoForSourcePath(src, dest)
		if err != nil {
			return nil, err
		}
		infos = append(infos, perSource...)
	}
	if len(infos) == 0 {
		return nil, errors.New("no source files were specified")
	}
	return infos, nil
}
// getCopyInfoForSourcePath resolves a single COPY/ADD source. Local
// (non-URL) sources go through calcCopyInfo (with wildcard expansion);
// URLs are downloaded first.
func (o *copier) getCopyInfoForSourcePath(orig, dest string) ([]copyInfo, error) {
	if !urlutil.IsURL(orig) {
		return o.calcCopyInfo(orig, true)
	}
	remote, path, err := o.download(orig)
	if err != nil {
		return nil, err
	}
	// If path == "" then we are unable to determine filename from src
	// We have to make sure dest is available
	if path == "" {
		if strings.HasSuffix(dest, "/") {
			return nil, errors.Errorf("cannot determine filename for source %s", orig)
		}
		path = unnamedFilename
	}
	// Remember the download directory so Cleanup can remove it later.
	o.tmpPaths = append(o.tmpPaths, remote.Root())
	hash, err := remote.Hash(path)
	ci := newCopyInfoFromSource(remote, path, hash)
	ci.noDecompress = true // data from http shouldn't be extracted even on ADD
	// NOTE: any Hash error is returned alongside the copy info, matching
	// the signature's (value, err) pair; callers must check err.
	return newCopyInfos(ci), err
}
// Cleanup removes any temporary directories created as part of downloading
// remote files.
func (o *copier) Cleanup() {
	for _, tmp := range o.tmpPaths {
		// Best effort: removal errors are deliberately ignored.
		os.RemoveAll(tmp)
	}
	o.tmpPaths = []string{}
	if o.activeLayer == nil {
		return
	}
	o.activeLayer.Release()
	o.activeLayer = nil
}
// TODO: allowWildcards can probably be removed by refactoring this function further.
//
// calcCopyInfo resolves one local source path into copy infos, expanding
// wildcards, consulting/updating the path cache, and hashing files or
// directory trees as needed.
func (o *copier) calcCopyInfo(origPath string, allowWildcards bool) ([]copyInfo, error) {
	imageSource := o.imageSource
	if err := validateCopySourcePath(imageSource, origPath); err != nil {
		return nil, err
	}
	// TODO: do this when creating copier. Requires validateCopySourcePath
	// (and other below) to be aware of the difference sources. Why is it only
	// done on image Source?
	if imageSource != nil && o.activeLayer == nil {
		// this needs to be protected against repeated calls as wildcard copy
		// will call it multiple times for a single COPY
		var err error
		rwLayer, err := imageSource.NewRWLayer()
		if err != nil {
			return nil, err
		}
		o.activeLayer = rwLayer
		// Replace the context source with a lazy view of the mounted layer.
		o.source, err = remotecontext.NewLazySource(rwLayer.Root())
		if err != nil {
			return nil, errors.Wrapf(err, "failed to create context for copy from %s", rwLayer.Root())
		}
	}
	if o.source == nil {
		return nil, errors.Errorf("missing build context")
	}
	// Work in daemon-specific OS filepath semantics
	origPath = filepath.FromSlash(origPath)
	origPath = strings.TrimPrefix(origPath, string(os.PathSeparator))
	origPath = strings.TrimPrefix(origPath, "."+string(os.PathSeparator))
	// Deal with wildcards
	if allowWildcards && containsWildcards(origPath) {
		return o.copyWithWildcards(origPath)
	}
	if imageSource != nil && imageSource.ImageID() != "" {
		// return a cached copy if one exists
		if h, ok := o.pathCache.Load(imageSource.ImageID() + origPath); ok {
			return newCopyInfos(newCopyInfoFromSource(o.source, origPath, h.(string))), nil
		}
	}
	// Deal with the single file case
	info, err := copyInfoForFile(o.source, origPath)
	switch {
	case imageSource == nil && errors.Is(err, os.ErrNotExist):
		return nil, errors.Wrapf(err, "file not found in build context or excluded by .dockerignore")
	case err != nil:
		return nil, err
	case info.hash != "":
		// A non-empty hash means origPath is a regular file; cache and return.
		o.storeInPathCache(imageSource, origPath, info.hash)
		return newCopyInfos(info), err
	}
	// TODO: remove, handle dirs in Hash()
	// Directory case: hash the sorted list of contained files instead.
	subfiles, err := walkSource(o.source, origPath)
	if err != nil {
		return nil, err
	}
	hash := hashStringSlice("dir", subfiles)
	o.storeInPathCache(imageSource, origPath, hash)
	return newCopyInfos(newCopyInfoFromSource(o.source, origPath, hash)), nil
}
// storeInPathCache records the hash computed for path, keyed by the image ID.
// It is a no-op when there is no image mount (i.e. copying from the build
// context), since only image sources are cacheable by ID.
func (o *copier) storeInPathCache(im *imageMount, path string, hash string) {
	if im == nil {
		return
	}
	o.pathCache.Store(im.ImageID()+path, hash)
}
// copyWithWildcards walks the source root and resolves every entry matching
// the wildcard pattern origPath, concatenating the resulting copy infos.
func (o *copier) copyWithWildcards(origPath string) ([]copyInfo, error) {
	root := o.source.Root()
	var infos []copyInfo
	walkFn := func(path string, _ os.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		rel, relErr := filepath.Rel(root, path)
		if relErr != nil {
			return relErr
		}
		if rel == "." {
			return nil
		}
		if matched, _ := filepath.Match(origPath, rel); !matched {
			return nil
		}
		// Note we set allowWildcards to false in case the name has
		// a * in it
		subInfos, err := o.calcCopyInfo(rel, false)
		if err != nil {
			return err
		}
		infos = append(infos, subInfos...)
		return nil
	}
	if err := filepath.WalkDir(root, walkFn); err != nil {
		return nil, err
	}
	return infos, nil
}
// copyInfoForFile stats path inside source and returns a copyInfo for it when
// it is a regular file. A directory yields an empty copyInfo (hash == ""),
// which signals the caller to fall back to directory hashing.
func copyInfoForFile(source builder.Source, path string) (copyInfo, error) {
	fi, err := remotecontext.StatAt(source, path)
	switch {
	case errors.Is(err, os.ErrNotExist):
		// return the relative path in the error, which is more user-friendly than the full path to the tmp-dir
		return copyInfo{}, errors.WithStack(&os.PathError{Op: "stat", Path: path, Err: os.ErrNotExist})
	case err != nil:
		return copyInfo{}, err
	case fi.IsDir():
		return copyInfo{}, nil
	}

	hash, err := source.Hash(path)
	if err != nil {
		return copyInfo{}, err
	}
	return newCopyInfoFromSource(source, path, "file:"+hash), nil
}
// TODO: dedupe with copyWithWildcards()
// walkSource collects the content hashes of every entry below origPath
// (expected to be a directory) in the source, sorted for a stable result.
func walkSource(source builder.Source, origPath string) ([]string, error) {
	fp, err := remotecontext.FullPath(source, origPath)
	if err != nil {
		return nil, err
	}
	// Must be a dir
	var subfiles []string
	walkFn := func(path string, _ os.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		rel, relErr := filepath.Rel(source.Root(), path)
		if relErr != nil {
			return relErr
		}
		if rel == "." {
			return nil
		}
		hash, hashErr := source.Hash(rel)
		if hashErr != nil {
			// we already checked handleHash above; entries that cannot be
			// hashed are deliberately skipped rather than failing the walk.
			return nil
		}
		subfiles = append(subfiles, hash)
		return nil
	}
	if err := filepath.WalkDir(fp, walkFn); err != nil {
		return nil, err
	}
	sort.Strings(subfiles)
	return subfiles, nil
}
// sourceDownloader downloads a source (e.g. a remote URL) and returns it as a
// builder.Source together with the filename determined for the download
// (empty when no filename could be determined).
type sourceDownloader func(string) (builder.Source, string, error)

// newRemoteSourceDownloader returns a sourceDownloader that fetches remote
// URLs via downloadSource, writing progress to output and a trailing blank
// line to stdout.
func newRemoteSourceDownloader(output, stdout io.Writer) sourceDownloader {
	return func(url string) (builder.Source, string, error) {
		return downloadSource(output, stdout, url)
	}
}
// errOnSourceDownload is the sourceDownloader used for COPY, which does not
// permit remote (URL) sources; it always returns an error.
func errOnSourceDownload(_ string) (builder.Source, string, error) {
	return nil, "", errors.New("source can't be a URL for COPY")
}
func getFilenameForDownload(path string, resp *http.Response) string {
// Guess filename based on source
if path != "" && !strings.HasSuffix(path, "/") {
if filename := filepath.Base(filepath.FromSlash(path)); filename != "" {
return filename
}
}
// Guess filename based on Content-Disposition
if contentDisposition := resp.Header.Get("Content-Disposition"); contentDisposition != "" {
if _, params, err := mime.ParseMediaType(contentDisposition); err == nil {
if params["filename"] != "" && !strings.HasSuffix(params["filename"], "/") {
if filename := filepath.Base(filepath.FromSlash(params["filename"])); filename != "" {
return filename
}
}
}
}
return ""
}
// downloadSource fetches srcURL into a fresh temporary directory and returns
// it as a lazily-hashed builder.Source, together with the filename determined
// for the download ("" when none could be determined; the temp file is then
// named unnamedFilename). On error, the temporary directory and file are
// cleaned up by the deferred handlers.
func downloadSource(output io.Writer, stdout io.Writer, srcURL string) (remote builder.Source, p string, retErr error) {
	u, err := url.Parse(srcURL)
	if err != nil {
		return nil, "", err
	}

	resp, err := remotecontext.GetWithStatusError(srcURL)
	if err != nil {
		return nil, "", err
	}
	// Close the response body so the underlying connection can be reused;
	// the original code leaked it.
	defer resp.Body.Close()

	filename := getFilenameForDownload(u.Path, resp)

	// Prepare file in a tmp dir
	tmpDir, err := longpath.MkdirTemp("", "docker-remote")
	if err != nil {
		return nil, "", err
	}
	defer func() {
		if retErr != nil {
			if err := os.RemoveAll(tmpDir); err != nil {
				log.G(context.TODO()).WithError(err).Debug("error cleaning up temp-directory after failing to download source")
			}
		}
	}()
	// If filename is empty, the returned filename will be "" but
	// the tmp filename will be created as "__unnamed__"
	tmpFileName := filename
	if filename == "" {
		tmpFileName = unnamedFilename
	}
	tmpFileName = filepath.Join(tmpDir, tmpFileName)
	tmpFile, err := os.OpenFile(tmpFileName, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0o600)
	if err != nil {
		return nil, "", err
	}
	defer func() {
		if retErr != nil {
			// Ignore os.ErrClosed errors, as the file may already be closed in this function.
			if err := tmpFile.Close(); err != nil && !errors.Is(err, os.ErrClosed) {
				log.G(context.TODO()).WithError(err).Debug("error closing temp-file after failing to download source")
			}
		}
	}()

	progressOutput := streamformatter.NewJSONProgressOutput(output, true)
	progressReader := progress.NewProgressReader(resp.Body, progressOutput, resp.ContentLength, "", "Downloading")
	// Download and dump result to tmp file
	// TODO: add filehash directly
	if _, err = io.Copy(tmpFile, progressReader); err != nil {
		return nil, "", err
	}
	// TODO: how important is this random blank line to the output?
	_, _ = fmt.Fprintln(stdout)

	// Set the mtime to the Last-Modified header value if present
	// Otherwise just remove atime and mtime
	mTime := time.Time{}

	lastMod := resp.Header.Get("Last-Modified")
	if lastMod != "" {
		// If we can't parse it then just let it default to 'zero'
		// otherwise use the parsed time value
		if parsedMTime, err := http.ParseTime(lastMod); err == nil {
			mTime = parsedMTime
		}
	}

	// TODO(thaJeztah): was there a reason for this file to be closed _before_ system.Chtimes, or could we unconditionally close this in a defer?
	if err := tmpFile.Close(); err != nil {
		log.G(context.TODO()).WithError(err).Debug("error closing temp-file before chtimes")
	}

	if err = system.Chtimes(tmpFileName, mTime, mTime); err != nil {
		return nil, "", err
	}

	// Hash lazily: callers only pay for hashing the paths they use.
	lc, err := remotecontext.NewLazySource(tmpDir)
	return lc, filename, err
}
// identity is the ownership to assign to files created by a copy operation.
type identity struct {
	UID int    // numeric user ID
	GID int    // numeric group ID
	SID string // NOTE(review): presumably a Windows security identifier; unused in the Unix paths visible here — confirm
}
// copyFileOptions configures performCopyForInfo.
type copyFileOptions struct {
	// decompress enables extracting archive sources in place (ADD semantics);
	// sources flagged noDecompress (e.g. HTTP downloads) are never extracted.
	decompress bool
	// identity, when non-nil, is the ownership applied to copied files.
	identity *identity
	// archiver performs the tar-based copy and untar operations.
	archiver *archive.Archiver
}
// performCopyForInfo copies a resolved source into dest. Directories are
// copied recursively; archive files are extracted when decompression is
// enabled and the source allows it; everything else is copied as a single
// file (into the destination directory when dest is, or ends with, a
// directory path).
func performCopyForInfo(dest copyInfo, source copyInfo, options copyFileOptions) error {
	srcPath, err := source.fullPath()
	if err != nil {
		return err
	}

	destPath, err := dest.fullPath()
	if err != nil {
		return err
	}

	archiver := options.archiver

	src, err := os.Stat(srcPath)
	if err != nil {
		return errors.Wrapf(err, "source path not found")
	}
	if src.IsDir() {
		return copyDirectory(archiver, srcPath, destPath, options.identity)
	}
	// ADD extracts local archives in place, unless the source was marked
	// noDecompress (e.g. data downloaded over HTTP).
	if options.decompress && archive.IsArchivePath(srcPath) && !source.noDecompress {
		f, err := os.Open(srcPath)
		if err != nil {
			return err
		}
		defer f.Close()
		return archiver.Untar(f, destPath, &archive.TarOptions{
			IDMap:            archiver.IDMapping,
			BestEffortXattrs: true,
		})
	}

	destExistsAsDir, err := isExistingDirectory(destPath)
	if err != nil {
		return err
	}
	// dest.path must be used because destPath has already been cleaned of any
	// trailing slash
	if destExistsAsDir || strings.HasSuffix(dest.path, string(os.PathSeparator)) {
		// source.path must be used to get the correct filename when the source
		// is a symlink
		destPath = filepath.Join(destPath, filepath.Base(source.path))
	}
	return copyFile(archiver, srcPath, destPath, options.identity)
}
// copyDirectory copies source into dest via tar, then (when an identity is
// given) fixes ownership on everything that was created or modified. The
// destination root itself is only chowned when it did not exist beforehand.
func copyDirectory(archiver *archive.Archiver, source, dest string, identity *identity) error {
	destExisted, err := isExistingDirectory(dest)
	if err != nil {
		return errors.Wrapf(err, "failed to query destination path")
	}

	if err := archiver.CopyWithTar(source, dest); err != nil {
		return errors.Wrapf(err, "failed to copy directory")
	}
	if identity == nil {
		return nil
	}
	return fixPermissions(source, dest, *identity, !destExisted)
}
// copyFile copies a single file from source to dest via tar, creating the
// destination's parent directories first (chowned to identity when given),
// then fixes ownership of the copied file itself.
func copyFile(archiver *archive.Archiver, source, dest string, identity *identity) error {
	destDir := filepath.Dir(dest)
	if identity == nil {
		if err := os.MkdirAll(destDir, 0o755); err != nil {
			return err
		}
	} else if err := user.MkdirAllAndChown(destDir, 0o755, identity.UID, identity.GID, user.WithOnlyNew); err != nil {
		return errors.Wrapf(err, "failed to create new directory")
	}

	if err := archiver.CopyFileWithTar(source, dest); err != nil {
		return errors.Wrapf(err, "failed to copy file")
	}
	if identity == nil {
		return nil
	}
	return fixPermissions(source, dest, *identity, false)
}
// isExistingDirectory returns true if the path exists and is a directory
func isExistingDirectory(path string) (bool, error) {
destStat, err := os.Stat(path)
switch {
case errors.Is(err, os.ErrNotExist):
return false, nil
case err != nil:
return false, err
}
return destStat.IsDir(), nil
}
//go:build !windows
package dockerfile
import (
"os"
"path"
"path/filepath"
"strings"
)
// fixPermissions chowns everything under destination that mirrors an entry
// under source to the given identity. The destination root is left untouched
// when it already existed (unless overrideSkip forces chowning it anyway).
func fixPermissions(source, destination string, id identity, overrideSkip bool) error {
	skipChownRoot := false
	if !overrideSkip {
		existed, err := isExistingDirectory(destination)
		if err != nil {
			return err
		}
		skipChownRoot = existed
	}

	// We Walk on the source rather than on the destination because we don't
	// want to change permissions on things we haven't created or modified.
	return filepath.WalkDir(source, func(fullpath string, _ os.DirEntry, _ error) error {
		// Do not alter the walk root iff. it existed before, as it doesn't fall under
		// the domain of "things we should chown".
		if skipChownRoot && fullpath == source {
			return nil
		}

		// Path is prefixed by source: substitute with destination instead.
		rel, err := filepath.Rel(source, fullpath)
		if err != nil {
			return err
		}
		return os.Lchown(filepath.Join(destination, rel), id.UID, id.GID)
	})
}
// normalizeDest normalises the destination of a COPY/ADD command in a
// platform semantically consistent way: an absolute destination is returned
// as-is (slash-converted); a relative one is joined onto the working
// directory, preserving any trailing slash.
func normalizeDest(workingDir, requested string) (string, error) {
	dest := filepath.FromSlash(requested)
	if path.IsAbs(requested) {
		return dest, nil
	}

	trailingSlash := strings.HasSuffix(dest, string(os.PathSeparator))
	dest = path.Join("/", filepath.ToSlash(workingDir), dest)
	// Make sure we preserve any trailing slash
	if trailingSlash {
		dest += "/"
	}
	return dest, nil
}
// containsWildcards reports whether name contains an unescaped glob wildcard
// character (*, ? or [); a backslash escapes the character that follows it.
func containsWildcards(name string) bool {
	escaped := false
	for i := 0; i < len(name); i++ {
		if escaped {
			escaped = false
			continue
		}
		switch name[i] {
		case '\\':
			escaped = true
		case '*', '?', '[':
			return true
		}
	}
	return false
}
// validateCopySourcePath checks whether origPath is a valid copy source for
// the given image mount. On non-Windows platforms there are no restrictions,
// so this is a no-op that always succeeds.
func validateCopySourcePath(imageSource *imageMount, origPath string) error {
	return nil
}
package dockerfile
// This file contains the dispatchers for each command. Note that
// `nullDispatch` is not actually a command, but support for commands we parse
// but do nothing with.
//
// See evaluator.go for a higher level discussion of the whole evaluator
// package.
import (
"bytes"
"context"
"fmt"
"runtime"
"sort"
"strings"
"github.com/containerd/platforms"
"github.com/docker/docker/daemon/builder"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/image"
"github.com/docker/docker/pkg/jsonmessage"
"github.com/docker/go-connections/nat"
"github.com/moby/buildkit/frontend/dockerfile/instructions"
"github.com/moby/buildkit/frontend/dockerfile/parser"
"github.com/moby/buildkit/frontend/dockerfile/shell"
"github.com/moby/sys/signal"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// noBaseImageSpecifier is the symbol used by the FROM
// command to specify that no base image is to be used
// (i.e. `FROM scratch`).
const noBaseImageSpecifier = "scratch"
// ENV foo bar
//
// Sets the environment variable foo to bar, also makes interpolation
// in the dockerfile available from the next statement on via ${foo}.
func dispatchEnv(ctx context.Context, d dispatchRequest, c *instructions.EnvCommand) error {
	runConfig := d.state.runConfig
	msg := bytes.NewBufferString("ENV")
	for _, kv := range c.Env {
		entry := kv.String()
		msg.WriteString(" " + entry)

		// Replace an existing variable with the same (case-folded per
		// platform rules) key, otherwise append a new entry.
		replaced := false
		for i, existing := range runConfig.Env {
			existingKey, _, _ := strings.Cut(existing, "=")
			if shell.EqualEnvKeys(existingKey, kv.Key) {
				runConfig.Env[i] = entry
				replaced = true
				break
			}
		}
		if !replaced {
			runConfig.Env = append(runConfig.Env, entry)
		}
	}
	return d.builder.commit(ctx, d.state, msg.String())
}
// MAINTAINER some text <maybe@an.email.address>
//
// Sets the maintainer metadata.
func dispatchMaintainer(ctx context.Context, d dispatchRequest, c *instructions.MaintainerCommand) error {
	d.state.maintainer = c.Maintainer
	commitMsg := "MAINTAINER " + c.Maintainer
	return d.builder.commit(ctx, d.state, commitMsg)
}
// LABEL some json data describing the image
//
// Sets the Label variable foo to bar,
func dispatchLabel(ctx context.Context, d dispatchRequest, c *instructions.LabelCommand) error {
	if d.state.runConfig.Labels == nil {
		d.state.runConfig.Labels = make(map[string]string)
	}
	var msg strings.Builder
	msg.WriteString("LABEL")
	for _, kv := range c.Labels {
		d.state.runConfig.Labels[kv.Key] = kv.Value
		msg.WriteString(" ")
		msg.WriteString(kv.String())
	}
	return d.builder.commit(ctx, d.state, msg.String())
}
// ADD foo /path
//
// Add the file 'foo' to '/path'. Tarball and Remote URL (http, https) handling
// exist here. If you do not wish to have this automatic handling, use COPY.
func dispatchAdd(ctx context.Context, d dispatchRequest, c *instructions.AddCommand) error {
	if c.Chmod != "" {
		return errors.New("the --chmod option requires BuildKit. Refer to https://docs.docker.com/go/buildkit/ to learn how to build images with BuildKit enabled")
	}
	// ADD may fetch remote URLs, so wire in the real downloader.
	fetch := newRemoteSourceDownloader(d.builder.Output, d.builder.Stdout)
	cp := copierFromDispatchRequest(d, fetch, nil)
	defer cp.Cleanup()

	inst, err := cp.createCopyInstruction(c.SourcesAndDest, "ADD")
	if err != nil {
		return err
	}
	inst.chownStr = c.Chown
	// ADD (unlike COPY) extracts local archives at the destination.
	inst.allowLocalDecompression = true

	return d.builder.performCopy(ctx, d, inst)
}
// COPY foo /path
//
// Same as 'ADD' but without the tar and remote url handling.
func dispatchCopy(ctx context.Context, d dispatchRequest, c *instructions.CopyCommand) error {
	if c.Chmod != "" {
		return errors.New("the --chmod option requires BuildKit. Refer to https://docs.docker.com/go/buildkit/ to learn how to build images with BuildKit enabled")
	}
	var srcImage *imageMount
	if c.From != "" {
		var err error
		srcImage, err = d.getImageMount(ctx, c.From)
		if err != nil {
			return errors.Wrapf(err, "invalid from flag value %s", c.From)
		}
	}
	// COPY never downloads; URL sources are rejected by errOnSourceDownload.
	cp := copierFromDispatchRequest(d, errOnSourceDownload, srcImage)
	defer cp.Cleanup()

	inst, err := cp.createCopyInstruction(c.SourcesAndDest, "COPY")
	if err != nil {
		return err
	}
	inst.chownStr = c.Chown
	// COPY --from without an explicit --chown keeps the ownership recorded
	// in the source image/stage.
	if c.From != "" && inst.chownStr == "" {
		inst.preserveOwnership = true
	}
	return d.builder.performCopy(ctx, d, inst)
}
// getImageMount resolves imageRefOrID (an image reference or an earlier build
// stage name) to a mounted image. Stage references are resolved to the
// stage's image ID and are never pulled from a registry.
func (d *dispatchRequest) getImageMount(ctx context.Context, imageRefOrID string) (*imageMount, error) {
	if imageRefOrID == "" {
		// TODO: this could return the source in the default case as well?
		return nil, nil
	}

	stage, err := d.stages.get(imageRefOrID)
	if err != nil {
		return nil, err
	}
	localOnly := false
	if stage != nil {
		imageRefOrID = stage.Image
		localOnly = true
	}
	return d.builder.imageSources.Get(ctx, imageRefOrID, localOnly, d.builder.platform)
}
// FROM [--platform=platform] imagename[:tag | @digest] [AS build-stage-name]
//
// initializeStage starts a new build stage: it resets the image prober,
// resolves the (optionally expanded) platform and base image, begins the
// stage state, and replays any ONBUILD triggers inherited from the base.
func initializeStage(ctx context.Context, d dispatchRequest, cmd *instructions.Stage) error {
	err := d.builder.imageProber.Reset(ctx)
	if err != nil {
		return err
	}

	var platform *ocispec.Platform
	if val := cmd.Platform; val != "" {
		// --platform may reference build args; expand before parsing.
		v, err := d.getExpandedString(d.shlex, val)
		if err != nil {
			return errors.Wrapf(err, "failed to process arguments for platform %s", v)
		}

		p, err := platforms.Parse(v)
		if err != nil {
			return errors.Wrapf(errdefs.InvalidParameter(err), "failed to parse platform %s", v)
		}
		platform = &p
	}

	img, err := d.getFromImage(ctx, d.shlex, cmd.BaseName, platform)
	if err != nil {
		return err
	}
	state := d.state
	if err := state.beginStage(cmd.Name, img); err != nil {
		return err
	}
	// Run ONBUILD triggers recorded in the base image, then clear them so
	// they do not fire again in images built from this one.
	if len(state.runConfig.OnBuild) > 0 {
		triggers := state.runConfig.OnBuild
		state.runConfig.OnBuild = nil
		return dispatchTriggeredOnBuild(ctx, d, triggers)
	}
	return nil
}
// dispatchTriggeredOnBuild parses and dispatches each ONBUILD trigger
// recorded in the base image. Each trigger must parse to exactly one
// Dockerfile instruction.
func dispatchTriggeredOnBuild(ctx context.Context, d dispatchRequest, triggers []string) error {
	fmt.Fprintf(d.builder.Stdout, "# Executing %d build trigger", len(triggers))
	if len(triggers) > 1 {
		// pluralize the message printed above
		fmt.Fprint(d.builder.Stdout, "s")
	}
	fmt.Fprintln(d.builder.Stdout)
	for _, trigger := range triggers {
		d.state.updateRunConfig()
		ast, err := parser.Parse(strings.NewReader(trigger))
		if err != nil {
			return err
		}
		if len(ast.AST.Children) != 1 {
			return errors.New("onbuild trigger should be a single expression")
		}
		cmd, err := instructions.ParseCommand(ast.AST.Children[0])
		if err != nil {
			var uiErr *instructions.UnknownInstructionError
			if errors.As(err, &uiErr) {
				// count unknown-instruction failures for metrics
				buildsFailed.WithValues(metricsUnknownInstructionError).Inc()
			}
			return err
		}
		err = dispatch(ctx, d, cmd)
		if err != nil {
			return err
		}
	}
	return nil
}
// getExpandedString expands build-arg references in str using the meta args
// declared before the first FROM, via the shell lexer.
func (d *dispatchRequest) getExpandedString(shlex *shell.Lex, str string) (string, error) {
	meta := d.state.buildArgs.GetAllMeta()
	substitutionArgs := make([]string, 0, len(meta))
	for key, value := range meta {
		substitutionArgs = append(substitutionArgs, key+"="+value)
	}
	expanded, _, err := shlex.ProcessWord(str, shell.EnvsFromSlice(substitutionArgs))
	if err != nil {
		return "", err
	}
	return expanded, nil
}
// getImageOrStage resolves a FROM target that may be an earlier build stage
// name, the special "scratch" specifier, or an image reference. Stage names
// resolve to the stage's image and are never pulled from a registry.
func (d *dispatchRequest) getImageOrStage(ctx context.Context, name string, platform *ocispec.Platform) (builder.Image, error) {
	var localOnly bool
	if im, ok := d.stages.getByName(name); ok {
		name = im.Image
		localOnly = true
	}

	if platform == nil {
		platform = d.builder.platform
	}
	// Windows cannot support a container with no base image.
	if name == noBaseImageSpecifier {
		// Windows supports scratch. What is not supported is running containers from it.
		if runtime.GOOS == "windows" {
			return nil, errors.New("Windows does not support FROM scratch")
		}

		// TODO: scratch should not have an os. It should be nil image.
		imageImage := &image.Image{}
		if platform != nil {
			imageImage.OS = platform.OS
		} else {
			imageImage.OS = runtime.GOOS
		}
		return builder.Image(imageImage), nil
	}
	imgMount, err := d.builder.imageSources.Get(ctx, name, localOnly, platform)
	if err != nil {
		return nil, err
	}
	return imgMount.Image(), nil
}
// getFromImage expands build args in basename and resolves the result to a
// base image or earlier build stage.
func (d *dispatchRequest) getFromImage(ctx context.Context, shlex *shell.Lex, basename string, platform *ocispec.Platform) (builder.Image, error) {
	expanded, err := d.getExpandedString(shlex, basename)
	if err != nil {
		return nil, err
	}
	if expanded == "" {
		// Empty string is interpreted to FROM scratch by
		// images.GetImageAndReleasableLayer, so validate the expanded
		// result is not empty.
		return nil, errors.Errorf("base name (%s) should not be blank", basename)
	}
	return d.getImageOrStage(ctx, expanded, platform)
}
// ONBUILD <instruction>
//
// dispatchOnbuild records a trigger expression to be replayed when this
// image is later used as a base image.
func dispatchOnbuild(ctx context.Context, d dispatchRequest, c *instructions.OnbuildCommand) error {
	runConfig := d.state.runConfig
	runConfig.OnBuild = append(runConfig.OnBuild, c.Expression)
	return d.builder.commit(ctx, d.state, "ONBUILD "+c.Expression)
}
// WORKDIR /tmp
//
// Set the working directory for future RUN/CMD/etc statements.
func dispatchWorkdir(ctx context.Context, d dispatchRequest, c *instructions.WorkdirCommand) error {
	runConfig := d.state.runConfig
	var err error
	runConfig.WorkingDir, err = normalizeWorkdir(d.state.operatingSystem, runConfig.WorkingDir, c.Path)
	if err != nil {
		return err
	}

	// For performance reasons, we explicitly do a create/mkdir now
	// This avoids having an unnecessary expensive mount/unmount calls
	// (on Windows in particular) during each container create.
	// Prior to 1.13, the mkdir was deferred and not executed at this step.
	if d.builder.disableCommit {
		// Don't call back into the daemon if we're going through docker commit --change "WORKDIR /foo".
		// We've already updated the runConfig and that's enough.
		return nil
	}

	comment := "WORKDIR " + runConfig.WorkingDir
	runConfigWithCommentCmd := copyRunConfig(runConfig, withCmdCommentString(comment, d.state.operatingSystem))

	// Probe the cache / create a container just to materialize the
	// directory; an empty containerID means a cache hit, so nothing to do.
	containerID, err := d.builder.probeAndCreate(ctx, d.state, runConfigWithCommentCmd)
	if err != nil || containerID == "" {
		return err
	}

	if err := d.builder.docker.ContainerCreateWorkdir(containerID); err != nil {
		return err
	}

	return d.builder.commitContainer(ctx, d.state, containerID, runConfigWithCommentCmd)
}
// RUN some command yo
//
// run a command and commit the image. Args are automatically prepended with
// the current SHELL which defaults to 'sh -c' under linux or 'cmd /S /C' under
// Windows, in the event there is only one argument The difference in processing:
//
// RUN echo hi          # sh -c echo hi       (Linux and LCOW)
// RUN echo hi          # cmd /S /C echo hi   (Windows)
// RUN [ "echo", "hi" ] # echo hi
func dispatchRun(ctx context.Context, d dispatchRequest, c *instructions.RunCommand) error {
	if err := image.CheckOS(d.state.operatingSystem); err != nil {
		return err
	}

	if len(c.FlagsUsed) > 0 {
		// classic builder RUN currently does not support any flags, so fail on the first one
		return errors.Errorf("the --%s option requires BuildKit. Refer to https://docs.docker.com/go/buildkit/ to learn how to build images with BuildKit enabled", c.FlagsUsed[0])
	}

	stateRunConfig := d.state.runConfig
	cmdFromArgs, argsEscaped := resolveCmdLine(c.ShellDependantCmdLine, stateRunConfig, d.state.operatingSystem, c.Name(), c.String())
	buildArgs := d.state.buildArgs.FilterAllowed(stateRunConfig.Env)

	// saveCmd is what gets committed/cached; build args are encoded into it
	// (as a "|N arg=val..." prefix) so cache keys reflect their values.
	saveCmd := cmdFromArgs
	if len(buildArgs) > 0 {
		saveCmd = prependEnvOnCmd(d.state.buildArgs, buildArgs, cmdFromArgs)
	}

	cacheArgsEscaped := argsEscaped
	// ArgsEscaped is not persisted in the committed image on Windows.
	// Use the original from previous build steps for cache probing.
	if d.state.operatingSystem == "windows" {
		cacheArgsEscaped = stateRunConfig.ArgsEscaped
	}

	runConfigForCacheProbe := copyRunConfig(stateRunConfig,
		withCmd(saveCmd),
		withArgsEscaped(cacheArgsEscaped),
		withEntrypointOverride(saveCmd, nil))
	if hit, err := d.builder.probeCache(d.state, runConfigForCacheProbe); err != nil || hit {
		return err
	}

	// The config actually executed: real command, build args in env, no
	// entrypoint and no healthcheck during the build step itself.
	runConfig := copyRunConfig(stateRunConfig,
		withCmd(cmdFromArgs),
		withArgsEscaped(argsEscaped),
		withEnv(append(stateRunConfig.Env, buildArgs...)),
		withEntrypointOverride(saveCmd, []string{""}),
		withoutHealthcheck())

	cID, err := d.builder.create(ctx, runConfig)
	if err != nil {
		return err
	}

	if err := d.builder.containerManager.Run(ctx, cID, d.builder.Stdout, d.builder.Stderr); err != nil {
		if err, ok := err.(*statusCodeError); ok {
			// TODO: change error type, because jsonmessage.JSONError assumes HTTP
			msg := fmt.Sprintf(
				"The command '%s' returned a non-zero code: %d",
				strings.Join(runConfig.Cmd, " "), err.StatusCode())
			if err.Error() != "" {
				msg = fmt.Sprintf("%s: %s", msg, err.Error())
			}
			return &jsonmessage.JSONError{
				Message: msg,
				Code:    err.StatusCode(),
			}
		}
		return err
	}

	// Don't persist the argsEscaped value in the committed image. Use the original
	// from previous build steps (only CMD and ENTRYPOINT persist this).
	if d.state.operatingSystem == "windows" {
		runConfigForCacheProbe.ArgsEscaped = stateRunConfig.ArgsEscaped
	}

	return d.builder.commitContainer(ctx, d.state, cID, runConfigForCacheProbe)
}
// Derive the command to use for probeCache() and to commit in this container.
// Note that we only do this if there are any build-time env vars.  Also, we
// use the special argument "|#" at the start of the args array. This will
// avoid conflicts with any RUN command since commands can not
// start with | (vertical bar). The "#" (number of build envs) is there to
// help ensure proper cache matches. We don't want a RUN command
// that starts with "foo=abc" to be considered part of a build-time env var.
//
// remove any unreferenced built-in args from the environment variables.
// These args are transparent so resulting image should be the same regardless
// of the value.
func prependEnvOnCmd(buildArgs *BuildArgs, buildArgVars []string, cmd []string) []string {
	kept := make([]string, 0, len(buildArgVars))
	for _, env := range buildArgVars {
		name, _, _ := strings.Cut(env, "=")
		if buildArgs.IsReferencedOrNotBuiltin(name) {
			kept = append(kept, env)
		}
	}
	// Sort for deterministic cache keys regardless of declaration order.
	sort.Strings(kept)
	prefix := append([]string{fmt.Sprintf("|%d", len(kept))}, kept...)
	return append(prefix, cmd...)
}
// CMD foo
//
// Set the default command to run in the container (which may be empty).
// Argument handling is the same as RUN.
func dispatchCmd(ctx context.Context, d dispatchRequest, c *instructions.CmdCommand) error {
	runConfig := d.state.runConfig
	cmd, argsEscaped := resolveCmdLine(c.ShellDependantCmdLine, runConfig, d.state.operatingSystem, c.Name(), c.String())

	// We warn here as Windows shell processing operates differently to Linux.
	// Linux:   /bin/sh -c "echo hello" world	--> hello
	// Windows: cmd /s /c "echo hello" world	--> hello world
	if d.state.operatingSystem == "windows" &&
		len(runConfig.Entrypoint) > 0 &&
		d.state.runConfig.ArgsEscaped != argsEscaped {
		fmt.Fprintf(d.builder.Stderr, " ---> [Warning] Shell-form ENTRYPOINT and exec-form CMD may have unexpected results\n")
	}

	runConfig.Cmd = cmd
	runConfig.ArgsEscaped = argsEscaped

	if err := d.builder.commit(ctx, d.state, fmt.Sprintf("CMD %q", cmd)); err != nil {
		return err
	}
	// Remember that CMD was set explicitly, so a later ENTRYPOINT does not
	// clear it (see dispatchEntrypoint).
	if len(c.ShellDependantCmdLine.CmdLine) != 0 {
		d.state.cmdSet = true
	}

	return nil
}
// HEALTHCHECK foo
//
// Set the default healthcheck command to run in the container (which may be empty).
// Argument handling is the same as RUN.
func dispatchHealthcheck(ctx context.Context, d dispatchRequest, c *instructions.HealthCheckCommand) error {
	runConfig := d.state.runConfig
	if hc := runConfig.Healthcheck; hc != nil {
		// Warn when replacing a previously configured healthcheck, unless
		// it was explicitly disabled ("NONE").
		if len(hc.Test) > 0 && hc.Test[0] != "NONE" {
			fmt.Fprintf(d.builder.Stdout, "Note: overriding previous HEALTHCHECK: %v\n", hc.Test)
		}
	}
	runConfig.Healthcheck = c.Health
	return d.builder.commit(ctx, d.state, fmt.Sprintf("HEALTHCHECK %q", runConfig.Healthcheck))
}
// ENTRYPOINT /usr/sbin/nginx
//
// Set the entrypoint to /usr/sbin/nginx. Will accept the CMD as the arguments
// to /usr/sbin/nginx. Uses the default shell if not in JSON format.
//
// Handles command processing similar to CMD and RUN, only req.runConfig.Entrypoint
// is initialized at newBuilder time instead of through argument parsing.
func dispatchEntrypoint(ctx context.Context, d dispatchRequest, c *instructions.EntrypointCommand) error {
	runConfig := d.state.runConfig
	cmd, argsEscaped := resolveCmdLine(c.ShellDependantCmdLine, runConfig, d.state.operatingSystem, c.Name(), c.String())

	// This warning is a little more complex than in dispatchCmd(), as the Windows base images (similar
	// universally to almost every Linux image out there) have a single .Cmd field populated so that
	// `docker run --rm image` starts the default shell which would typically be sh on Linux,
	// or cmd on Windows. The catch to this is that if a dockerfile had `CMD ["c:\\windows\\system32\\cmd.exe"]`,
	// we wouldn't be able to tell the difference. However, that would be highly unlikely, and besides, this
	// is only trying to give a helpful warning of possibly unexpected results.
	if d.state.operatingSystem == "windows" &&
		d.state.runConfig.ArgsEscaped != argsEscaped &&
		((len(runConfig.Cmd) == 1 && strings.ToLower(runConfig.Cmd[0]) != `c:\windows\system32\cmd.exe` && len(runConfig.Shell) == 0) || (len(runConfig.Cmd) > 1)) {
		fmt.Fprintf(d.builder.Stderr, " ---> [Warning] Shell-form CMD and exec-form ENTRYPOINT may have unexpected results\n")
	}

	runConfig.Entrypoint = cmd
	runConfig.ArgsEscaped = argsEscaped

	// Clear any CMD inherited from the base image: setting ENTRYPOINT resets
	// CMD unless CMD was set explicitly in this Dockerfile (cmdSet).
	if !d.state.cmdSet {
		runConfig.Cmd = nil
	}

	return d.builder.commit(ctx, d.state, fmt.Sprintf("ENTRYPOINT %q", runConfig.Entrypoint))
}
// EXPOSE 6667/tcp 7000/tcp
//
// Expose ports for links and port mappings. This all ends up in
// req.runConfig.ExposedPorts for runconfig.
func dispatchExpose(ctx context.Context, d dispatchRequest, c *instructions.ExposeCommand, envs shell.EnvGetter) error {
	// custom multi word expansion
	// expose $FOO with FOO="80 443" is expanded as EXPOSE [80,443]. This is the only command supporting word to words expansion
	// so the word processing has been de-generalized
	expanded := make([]string, 0, len(c.Ports))
	for _, port := range c.Ports {
		words, err := d.shlex.ProcessWords(port, envs)
		if err != nil {
			return err
		}
		expanded = append(expanded, words...)
	}
	c.Ports = expanded

	portSet, _, err := nat.ParsePortSpecs(expanded)
	if err != nil {
		return err
	}

	if d.state.runConfig.ExposedPorts == nil {
		d.state.runConfig.ExposedPorts = make(nat.PortSet)
	}
	for p := range portSet {
		d.state.runConfig.ExposedPorts[p] = struct{}{}
	}

	return d.builder.commit(ctx, d.state, "EXPOSE "+strings.Join(c.Ports, " "))
}
// USER foo
//
// Set the user to 'foo' for future commands and when running the
// ENTRYPOINT/CMD at container run time.
func dispatchUser(ctx context.Context, d dispatchRequest, c *instructions.UserCommand) error {
	commitMsg := fmt.Sprintf("USER %v", c.User)
	d.state.runConfig.User = c.User
	return d.builder.commit(ctx, d.state, commitMsg)
}
// VOLUME /foo
//
// Expose the volume /foo for use. Will also accept the JSON array form.
func dispatchVolume(ctx context.Context, d dispatchRequest, c *instructions.VolumeCommand) error {
	runConfig := d.state.runConfig
	if runConfig.Volumes == nil {
		runConfig.Volumes = map[string]struct{}{}
	}
	for _, volume := range c.Volumes {
		if volume == "" {
			return errors.New("VOLUME specified can not be an empty string")
		}
		runConfig.Volumes[volume] = struct{}{}
	}
	return d.builder.commit(ctx, d.state, fmt.Sprintf("VOLUME %v", c.Volumes))
}
// STOPSIGNAL signal
//
// Set the signal that will be used to kill the container.
func dispatchStopSignal(ctx context.Context, d dispatchRequest, c *instructions.StopSignalCommand) error {
	// Validate the signal name/number before recording it.
	if _, err := signal.ParseSignal(c.Signal); err != nil {
		return errdefs.InvalidParameter(err)
	}
	d.state.runConfig.StopSignal = c.Signal
	return d.builder.commit(ctx, d.state, fmt.Sprintf("STOPSIGNAL %v", c.Signal))
}
// ARG name[=value]
//
// Adds the variable foo to the trusted list of variables that can be passed
// to builder using the --build-arg flag for expansion/substitution or passing to 'run'.
// Dockerfile author may optionally set a default value of this variable.
func dispatchArg(ctx context.Context, d dispatchRequest, c *instructions.ArgCommand) error {
	parts := make([]string, 0, len(c.Args))
	for _, arg := range c.Args {
		text := arg.Key
		if arg.Value != nil {
			text += "=" + *arg.Value
		}
		parts = append(parts, text)
		d.state.buildArgs.AddArg(arg.Key, arg.Value)
	}
	return d.builder.commit(ctx, d.state, "ARG "+strings.Join(parts, " "))
}
// SHELL powershell -command
//
// Set the non-default shell to use.
func dispatchShell(ctx context.Context, d dispatchRequest, c *instructions.ShellCommand) error {
	d.state.runConfig.Shell = c.Shell
	commitMsg := fmt.Sprintf("SHELL %v", d.state.runConfig.Shell)
	return d.builder.commit(ctx, d.state, commitMsg)
}
//go:build !windows
package dockerfile
import (
"errors"
"os"
"path/filepath"
"github.com/moby/buildkit/frontend/dockerfile/instructions"
"github.com/moby/moby/api/types/container"
)
// normalizeWorkdir normalizes a user requested working directory in a
// platform semantically consistent way. An empty request is rejected; a
// relative path is joined onto the current working directory; an absolute
// path is cleaned. The first (OS) parameter is unused on non-Windows.
func normalizeWorkdir(_ string, current string, requested string) (string, error) {
	if requested == "" {
		return "", errors.New("cannot normalize nothing")
	}
	current = filepath.FromSlash(current)
	requested = filepath.FromSlash(requested)
	if filepath.IsAbs(requested) {
		return filepath.Clean(requested), nil
	}
	return filepath.Join(string(os.PathSeparator), current, requested), nil
}
// resolveCmdLine takes a command line arg set and optionally prepends a
// platform-specific shell in front of it. The boolean result (argsEscaped)
// is always false in this (non-Windows) implementation.
func resolveCmdLine(cmd instructions.ShellDependantCmdLine, runConfig *container.Config, os, _, _ string) ([]string, bool) {
	if !cmd.PrependShell || cmd.CmdLine == nil {
		return cmd.CmdLine, false
	}
	return append(getShell(runConfig, os), cmd.CmdLine...), false
}
// Package dockerfile is the evaluation step in the Dockerfile parse/evaluate pipeline.
//
// It incorporates a dispatch table based on the parser.Node values (see the
// parser package for more information) that are yielded from the parser itself.
// Calling newBuilder with the BuildOpts struct can be used to customize the
// experience for execution purposes only. Parsing is controlled in the parser
// package, and this division of responsibility should be respected.
//
// Please see the jump table targets for the actual invocations, most of which
// will call out to the functions in internals.go to deal with their tasks.
//
// ONBUILD is a special case, which is covered in the onbuild() func in
// dispatchers.go.
//
// The evaluator uses the concept of "steps", which are usually each processable
// line in the Dockerfile. Each step is numbered and certain actions are taken
// before and after each step, such as creating an image ID and removing temporary
// containers and images. Note that ONBUILD creates a kinda-sorta "sub run" which
// includes its own set of steps (usually only one of them).
package dockerfile
import (
"context"
"reflect"
"strconv"
"strings"
"github.com/docker/docker/daemon/builder"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/image"
"github.com/docker/docker/oci"
"github.com/moby/buildkit/frontend/dockerfile/instructions"
"github.com/moby/buildkit/frontend/dockerfile/shell"
"github.com/moby/moby/api/types/container"
"github.com/pkg/errors"
)
// dispatch routes a single Dockerfile instruction to its handler function.
//
// Before dispatching it (1) rejects instructions that are not supported on
// the current stage's operating system, and (2) expands environment
// variables in the instruction using the run config's Env plus the allowed
// build args. A deferred cleanup removes intermediate containers according
// to the ForceRemove/Remove builder options.
func dispatch(ctx context.Context, d dispatchRequest, cmd instructions.Command) (retErr error) {
	if c, ok := cmd.(instructions.PlatformSpecific); ok {
		err := c.CheckPlatform(d.state.operatingSystem)
		if err != nil {
			return errdefs.InvalidParameter(err)
		}
	}
	// Expansion environment: runConfig env first, then build args that are
	// allowed and not already shadowed by a runConfig env var.
	runConfigEnv := d.state.runConfig.Env
	envs := shell.EnvsFromSlice(append(runConfigEnv, d.state.buildArgs.FilterAllowed(runConfigEnv)...))
	if ex, ok := cmd.(instructions.SupportsSingleWordExpansion); ok {
		err := ex.Expand(func(word string) (string, error) {
			newword, _, err := d.shlex.ProcessWord(word, envs)
			return newword, err
		})
		if err != nil {
			return errdefs.InvalidParameter(err)
		}
	}
	defer func() {
		// ForceRemove (--force-rm) cleans up intermediate containers even on
		// failure; Remove (--rm) only cleans up when the instruction succeeded.
		if d.builder.options.ForceRemove {
			d.builder.containerManager.RemoveAll(d.builder.Stdout)
			return
		}
		if d.builder.options.Remove && retErr == nil {
			d.builder.containerManager.RemoveAll(d.builder.Stdout)
			return
		}
	}()
	// One case per supported instruction type; anything else is rejected below.
	switch c := cmd.(type) {
	case *instructions.EnvCommand:
		return dispatchEnv(ctx, d, c)
	case *instructions.MaintainerCommand:
		return dispatchMaintainer(ctx, d, c)
	case *instructions.LabelCommand:
		return dispatchLabel(ctx, d, c)
	case *instructions.AddCommand:
		return dispatchAdd(ctx, d, c)
	case *instructions.CopyCommand:
		return dispatchCopy(ctx, d, c)
	case *instructions.OnbuildCommand:
		return dispatchOnbuild(ctx, d, c)
	case *instructions.WorkdirCommand:
		return dispatchWorkdir(ctx, d, c)
	case *instructions.RunCommand:
		return dispatchRun(ctx, d, c)
	case *instructions.CmdCommand:
		return dispatchCmd(ctx, d, c)
	case *instructions.HealthCheckCommand:
		return dispatchHealthcheck(ctx, d, c)
	case *instructions.EntrypointCommand:
		return dispatchEntrypoint(ctx, d, c)
	case *instructions.ExposeCommand:
		return dispatchExpose(ctx, d, c, envs)
	case *instructions.UserCommand:
		return dispatchUser(ctx, d, c)
	case *instructions.VolumeCommand:
		return dispatchVolume(ctx, d, c)
	case *instructions.StopSignalCommand:
		return dispatchStopSignal(ctx, d, c)
	case *instructions.ArgCommand:
		return dispatchArg(ctx, d, c)
	case *instructions.ShellCommand:
		return dispatchShell(ctx, d, c)
	}
	return errors.Errorf("unsupported command type: %v", reflect.TypeOf(cmd))
}
// dispatchState is a data object which is modified by dispatchers
type dispatchState struct {
	// runConfig accumulates the container configuration for the current stage.
	runConfig *container.Config
	// maintainer holds the MAINTAINER value; used as the commit Author.
	maintainer string
	// cmdSet presumably records whether CMD was set explicitly; the writer
	// is outside this chunk — confirm against dispatchCmd.
	cmdSet bool
	// imageID is the ID of the image produced by the last committed step.
	imageID string
	// baseImage is the image this stage started FROM.
	baseImage builder.Image
	// stageName is the stage's "AS" name; empty for unnamed stages.
	stageName string
	// buildArgs tracks declared/allowed build arguments for this stage.
	buildArgs *BuildArgs
	// operatingSystem is the OS reported by the stage's base image.
	operatingSystem string
}
// newDispatchState creates a fresh dispatchState seeded with a clone of
// baseArgs whose set of allowed build args has been reset.
func newDispatchState(baseArgs *BuildArgs) *dispatchState {
	clonedArgs := baseArgs.Clone()
	clonedArgs.ResetAllowed()
	return &dispatchState{
		runConfig: &container.Config{},
		buildArgs: clonedArgs,
	}
}
// stagesBuildResults tracks the result (final container config) of each
// completed build stage, both in declaration order and indexed by
// lower-cased stage name.
type stagesBuildResults struct {
	// flat holds stage results in the order the stages completed.
	flat []*container.Config
	// indexed maps lower-cased stage name to its result.
	indexed map[string]*container.Config
}

// newStagesBuildResults returns an empty result set with the name index
// initialized.
func newStagesBuildResults() *stagesBuildResults {
	return &stagesBuildResults{
		indexed: make(map[string]*container.Config),
	}
}
// getByName looks up a completed stage's result by case-insensitive name.
func (r *stagesBuildResults) getByName(name string) (*container.Config, bool) {
	cfg, found := r.indexed[strings.ToLower(name)]
	return cfg, found
}
// validateIndex reports an error unless i refers to a previously completed
// stage: the current (in-progress) stage and out-of-range indexes are both
// rejected.
func (r *stagesBuildResults) validateIndex(i int) error {
	switch {
	case i == len(r.flat):
		return errors.New("refers to current build stage")
	case i < 0 || i > len(r.flat):
		return errors.New("index out of bounds")
	default:
		return nil
	}
}
// get resolves a stage reference that may be either a stage name or a
// numeric index. A non-numeric, unknown name yields (nil, nil) so the
// caller can treat the reference as something other than a stage.
func (r *stagesBuildResults) get(nameOrIndex string) (*container.Config, error) {
	if cfg, found := r.getByName(nameOrIndex); found {
		return cfg, nil
	}
	idx, convErr := strconv.ParseInt(nameOrIndex, 10, 0)
	if convErr != nil {
		// Not a numeric index; not an error either.
		return nil, nil
	}
	if err := r.validateIndex(int(idx)); err != nil {
		return nil, err
	}
	return r.flat[idx], nil
}
// checkStageNameAvailable returns an error when name is already taken by a
// previous stage; an empty name is always available.
func (r *stagesBuildResults) checkStageNameAvailable(name string) error {
	if name == "" {
		return nil
	}
	if _, taken := r.getByName(name); taken {
		return errors.Errorf("%s stage name already used", name)
	}
	return nil
}
// commitStage records config as the result of a completed stage. A named
// stage is also indexed by its lower-cased name; duplicate names are
// rejected.
func (r *stagesBuildResults) commitStage(name string, config *container.Config) error {
	if name != "" {
		if _, taken := r.getByName(name); taken {
			return errors.Errorf("%s stage name already used", name)
		}
		r.indexed[strings.ToLower(name)] = config
	}
	r.flat = append(r.flat, config)
	return nil
}
// commitStage records the current dispatch state's result (its run config)
// under the state's stage name.
func commitStage(state *dispatchState, stages *stagesBuildResults) error {
	return stages.commitStage(state.stageName, state.runConfig)
}

// dispatchRequest bundles everything a dispatcher needs to process one
// instruction: the mutable stage state, a shell lexer for word expansion,
// the builder, the build-context source, and prior stage results.
type dispatchRequest struct {
	state *dispatchState
	shlex *shell.Lex
	builder *Builder
	source builder.Source
	stages *stagesBuildResults
}
// newDispatchRequest assembles a dispatchRequest with a fresh dispatch
// state and a shell lexer configured for the Dockerfile's escape token.
func newDispatchRequest(builder *Builder, escapeToken rune, source builder.Source, buildArgs *BuildArgs, stages *stagesBuildResults) dispatchRequest {
	req := dispatchRequest{
		state:   newDispatchState(buildArgs),
		shlex:   shell.NewLex(escapeToken),
		builder: builder,
		source:  source,
		stages:  stages,
	}
	return req
}
// updateRunConfig points the run config's Image at the most recently
// committed image ID.
func (s *dispatchState) updateRunConfig() {
	s.runConfig.Image = s.imageID
}

// hasFromImage returns true if the builder has processed a `FROM <image>` line
// NOTE(review): a non-nil baseImage with an empty ImageID also counts —
// presumably the scratch case; confirm against beginStage callers.
func (s *dispatchState) hasFromImage() bool {
	return s.imageID != "" || (s.baseImage != nil && s.baseImage.ImageID() == "")
}
// beginStage initializes the dispatch state for a new build stage based on
// img: it validates the image's OS, copies (or zeroes) its run config so
// stages never share a config instance, and applies stage defaults.
func (s *dispatchState) beginStage(stageName string, img builder.Image) error {
	s.stageName = stageName
	s.imageID = img.ImageID()
	s.operatingSystem = img.OperatingSystem()
	if err := image.CheckOS(s.operatingSystem); err != nil {
		return err
	}
	// Copy avoids referencing the same instance when two stages share a base.
	if baseCfg := img.RunConfig(); baseCfg != nil {
		s.runConfig = copyRunConfig(baseCfg)
	} else {
		s.runConfig = &container.Config{}
	}
	s.baseImage = img
	s.setDefaultPath()
	s.runConfig.OpenStdin = false
	s.runConfig.StdinOnce = false
	return nil
}
// Add the default PATH to runConfig.ENV if one exists for the operating system and there
// is no PATH set. Note that Windows containers on Windows won't have one as it's set by HCS
func (s *dispatchState) setDefaultPath() {
	// TODO(thaJeztah): use github.com/moby/buildkit/util/system.DefaultPathEnv() once https://github.com/moby/buildkit/pull/3158 is resolved.
	defaultPath := oci.DefaultPathEnv(s.operatingSystem)
	if defaultPath == "" {
		// No default PATH for this OS (e.g. Windows); nothing to do.
		return
	}
	envMap := convertKVStringsToMap(s.runConfig.Env)
	if _, ok := envMap["PATH"]; !ok {
		s.runConfig.Env = append(s.runConfig.Env, "PATH="+defaultPath)
	}
}
package dockerfile
import (
"context"
"runtime"
"github.com/containerd/log"
"github.com/containerd/platforms"
"github.com/docker/docker/daemon/builder"
dockerimage "github.com/docker/docker/image"
"github.com/moby/moby/api/types/backend"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// getAndMountFunc resolves an image reference (optionally local-only, for a
// requested platform) to an image plus a mounted read-only layer.
type getAndMountFunc func(context.Context, string, bool, *ocispec.Platform) (builder.Image, builder.ROLayer, error)

// imageSources mounts images and provides a cache for mounted images. It tracks
// all images so they can be unmounted at the end of the build.
type imageSources struct {
	// byImageID caches mounts by image ID so each image is mounted only once.
	byImageID map[string]*imageMount
	// mounts lists every mount added, for Unmount at the end of the build.
	mounts []*imageMount
	// getImage fetches and mounts an image; see newImageSources.
	getImage getAndMountFunc
}
// newImageSources constructs an imageSources whose getImage function pulls
// through the build backend, honoring the PullParent option unless the
// caller restricts resolution to local images only.
func newImageSources(options builderOptions) *imageSources {
	fetch := func(ctx context.Context, idOrRef string, localOnly bool, platform *ocispec.Platform) (builder.Image, builder.ROLayer, error) {
		pullOption := backend.PullOptionNoPull
		switch {
		case localOnly:
			// keep PullOptionNoPull
		case options.Options.PullParent:
			pullOption = backend.PullOptionForcePull
		default:
			pullOption = backend.PullOptionPreferLocal
		}
		return options.Backend.GetImageAndReleasableLayer(ctx, idOrRef, backend.GetImageAndLayerOptions{
			PullOption: pullOption,
			AuthConfig: options.Options.AuthConfigs,
			Output:     options.ProgressWriter.Output,
			Platform:   platform,
		})
	}
	return &imageSources{
		byImageID: make(map[string]*imageMount),
		getImage:  fetch,
	}
}
// Get returns a (possibly cached) mount for idOrRef, mounting the image on
// first use and registering the mount for release at the end of the build.
func (m *imageSources) Get(ctx context.Context, idOrRef string, localOnly bool, platform *ocispec.Platform) (*imageMount, error) {
	if cached, found := m.byImageID[idOrRef]; found {
		return cached, nil
	}
	img, layer, err := m.getImage(ctx, idOrRef, localOnly, platform)
	if err != nil {
		return nil, err
	}
	mount := newImageMount(img, layer)
	m.Add(mount, platform)
	return mount, nil
}
// Unmount releases every tracked mount. All mounts are attempted even if
// some fail; each failure is logged and the last one is returned.
func (m *imageSources) Unmount() (retErr error) {
	for _, mount := range m.mounts {
		err := mount.unmount()
		if err == nil {
			continue
		}
		log.G(context.TODO()).Error(err)
		retErr = err
	}
	return retErr
}
// Add registers an image mount for release at the end of the build. A mount
// without an image (scratch) is given a synthetic image carrying the
// requested (or default) platform; mounts with an image are also cached by
// image ID.
func (m *imageSources) Add(im *imageMount, platform *ocispec.Platform) {
	if im.image == nil {
		// Set the platform for scratch images
		if platform == nil {
			p := platforms.DefaultSpec()
			platform = &p
		}
		// Windows does not support scratch except for LCOW
		imgOS := platform.OS
		if runtime.GOOS == "windows" {
			imgOS = "linux"
		}
		im.image = &dockerimage.Image{V1Image: dockerimage.V1Image{
			OS:           imgOS,
			Architecture: platform.Architecture,
			Variant:      platform.Variant,
		}}
	} else {
		m.byImageID[im.image.ImageID()] = im
	}
	m.mounts = append(m.mounts, im)
}
// imageMount is a reference to an image that can be used as a builder.Source
type imageMount struct {
	// image is the mounted image; may start out nil for scratch, in which
	// case imageSources.Add fills in a synthetic image.
	image builder.Image
	// layer is the mounted read-only layer; nil once released (see unmount).
	layer builder.ROLayer
}

// newImageMount pairs an image with its mounted read-only layer.
func newImageMount(image builder.Image, layer builder.ROLayer) *imageMount {
	im := &imageMount{image: image, layer: layer}
	return im
}
// unmount releases the mount's layer, if any. It is idempotent: the layer
// reference is cleared after a successful release.
func (im *imageMount) unmount() error {
	if im.layer == nil {
		return nil
	}
	releaseErr := im.layer.Release()
	if releaseErr != nil {
		return errors.Wrapf(releaseErr, "failed to unmount previous build image %s", im.image.ImageID())
	}
	im.layer = nil
	return nil
}
// Image returns the mounted image.
func (im *imageMount) Image() builder.Image {
	return im.image
}

// NewRWLayer creates a fresh read-write layer on top of the mounted layer.
func (im *imageMount) NewRWLayer() (builder.RWLayer, error) {
	return im.layer.NewRWLayer()
}

// ImageID returns the ID of the mounted image.
func (im *imageMount) ImageID() string {
	return im.image.ImageID()
}
package dockerfile
import (
"context"
"github.com/containerd/log"
"github.com/docker/docker/daemon/builder"
"github.com/moby/moby/api/types/container"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// ImageProber exposes an Image cache to the Builder. It supports resetting a
// cache.
type ImageProber interface {
	// Reset rebuilds the underlying cache.
	Reset(ctx context.Context) error
	// Probe returns the cached image ID matching (parentID, runConfig,
	// platform), or the empty string on a miss.
	Probe(parentID string, runConfig *container.Config, platform ocispec.Platform) (string, error)
}

// resetFunc builds a fresh builder.ImageCache.
type resetFunc func(context.Context) (builder.ImageCache, error)

// imageProber is the cache-backed ImageProber implementation.
type imageProber struct {
	// cache is the current image cache; replaced on Reset.
	cache builder.ImageCache
	// reset recreates the cache (see Reset).
	reset resetFunc
	// cacheBusted is set after the first miss; subsequent probes then
	// short-circuit to a miss without consulting the cache.
	cacheBusted bool
}
// newImageProber builds an ImageProber backed by cacheBuilder. When noCache
// is set a no-op prober is returned instead, so every probe reports a miss.
func newImageProber(ctx context.Context, cacheBuilder builder.ImageCacheBuilder, cacheFrom []string, noCache bool) (ImageProber, error) {
	if noCache {
		return &nopProber{}, nil
	}
	makeCache := func(ctx context.Context) (builder.ImageCache, error) {
		return cacheBuilder.MakeImageCache(ctx, cacheFrom)
	}
	initial, err := makeCache(ctx)
	if err != nil {
		return nil, err
	}
	return &imageProber{cache: initial, reset: makeCache}, nil
}
// Reset rebuilds the underlying image cache and clears the cache-busted
// flag so probing resumes.
func (c *imageProber) Reset(ctx context.Context) error {
	fresh, err := c.reset(ctx)
	if err != nil {
		return err
	}
	c.cache, c.cacheBusted = fresh, false
	return nil
}
// Probe checks if a cache match can be found for the current build
// instruction. It returns the cached image ID on a hit and the empty string
// on a miss; after the first miss the cache is considered busted and every
// later probe misses immediately.
func (c *imageProber) Probe(parentID string, runConfig *container.Config, platform ocispec.Platform) (string, error) {
	if c.cacheBusted {
		return "", nil
	}
	cachedID, err := c.cache.GetCache(parentID, runConfig, platform)
	if err != nil {
		return "", err
	}
	if cachedID != "" {
		log.G(context.TODO()).Debugf("[BUILDER] Use cached version: %s", runConfig.Cmd)
		return cachedID, nil
	}
	log.G(context.TODO()).Debugf("[BUILDER] Cache miss: %s", runConfig.Cmd)
	c.cacheBusted = true
	return "", nil
}
// nopProber is the ImageProber used when caching is disabled: every probe
// reports a miss and Reset does nothing.
type nopProber struct{}

// Reset is a no-op.
func (c *nopProber) Reset(ctx context.Context) error {
	return nil
}

// Probe always reports a cache miss.
func (c *nopProber) Probe(_ string, _ *container.Config, _ ocispec.Platform) (string, error) {
	return "", nil
}
package dockerfile
// internals for handling commands. Covers many areas and a lot of
// non-contiguous functionality. Please read the comments.
import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"strings"
"github.com/containerd/log"
"github.com/containerd/platforms"
"github.com/docker/docker/daemon/builder"
networkSettings "github.com/docker/docker/daemon/network"
"github.com/docker/docker/image"
"github.com/docker/docker/pkg/stringid"
"github.com/docker/go-connections/nat"
"github.com/moby/go-archive"
"github.com/moby/go-archive/chrootarchive"
"github.com/moby/moby/api/types/backend"
"github.com/moby/moby/api/types/build"
"github.com/moby/moby/api/types/container"
"github.com/moby/moby/api/types/network"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// getArchiver returns an archiver that extracts inside a chroot and applies
// the builder's identity mapping.
func (b *Builder) getArchiver() *archive.Archiver {
	return chrootarchive.NewArchiver(b.idMapping)
}
// commit commits the current build step as an image (unless committing is
// disabled), using a "#(nop)" comment command so the step participates in
// build caching.
func (b *Builder) commit(ctx context.Context, dispatchState *dispatchState, comment string) error {
	if b.disableCommit {
		return nil
	}
	if !dispatchState.hasFromImage() {
		return errors.New("Please provide a source image with `from` prior to commit")
	}
	cfgWithComment := copyRunConfig(dispatchState.runConfig, withCmdComment(comment, dispatchState.operatingSystem))
	containerID, err := b.probeAndCreate(ctx, dispatchState, cfgWithComment)
	if err != nil || containerID == "" {
		// Either a failure, or a cache hit (no container was created).
		return err
	}
	return b.commitContainer(ctx, dispatchState, containerID, cfgWithComment)
}
// commitContainer commits container id as a new image via the backend and
// records the resulting image ID in the dispatch state.
func (b *Builder) commitContainer(ctx context.Context, dispatchState *dispatchState, id string, containerConfig *container.Config) error {
	if b.disableCommit {
		return nil
	}
	imageID, err := b.docker.CommitBuildStep(ctx, backend.CommitConfig{
		Author: dispatchState.maintainer,
		// TODO: this copy should be done by Commit()
		Config:          copyRunConfig(dispatchState.runConfig),
		ContainerConfig: containerConfig,
		ContainerID:     id,
	})
	dispatchState.imageID = string(imageID)
	return err
}
// exportImage commits the RW layer to a new read-only layer, wraps it in a
// child image of parent, registers the image with the backend, and advances
// state.imageID to the exported image.
func (b *Builder) exportImage(ctx context.Context, state *dispatchState, layer builder.RWLayer, parent builder.Image, runConfig *container.Config) error {
	newLayer, err := layer.Commit()
	if err != nil {
		return err
	}
	parentImage, ok := parent.(*image.Image)
	if !ok {
		return errors.Errorf("unexpected image type")
	}
	// The child inherits the parent's platform.
	platform := &ocispec.Platform{
		OS:           parentImage.OS,
		Architecture: parentImage.Architecture,
		Variant:      parentImage.Variant,
	}
	// add an image mount without an image so the layer is properly unmounted
	// if there is an error before we can add the full mount with image
	b.imageSources.Add(newImageMount(nil, newLayer), platform)
	newImage := image.NewChildImage(parentImage, image.ChildConfig{
		Author:          state.maintainer,
		ContainerConfig: runConfig,
		DiffID:          newLayer.DiffID(),
		Config:          copyRunConfig(state.runConfig),
	}, parentImage.OS)
	// TODO: it seems strange to marshal this here instead of just passing in the
	// image struct
	config, err := newImage.MarshalJSON()
	if err != nil {
		return errors.Wrap(err, "failed to encode image config")
	}
	// when writing the new image's manifest, we now need to pass in the new layer's digest.
	// before the containerd store work this was unnecessary since we get the layer id
	// from the image's RootFS ChainID -- see:
	// https://github.com/moby/moby/blob/8cf66ed7322fa885ef99c4c044fa23e1727301dc/image/store.go#L162
	// however, with the containerd store we can't do this. An alternative implementation here
	// without changing the signature would be to get the layer digest by walking the content store
	// and filtering the objects to find the layer with the DiffID we want, but that has performance
	// implications that should be called out/investigated
	exportedImage, err := b.docker.CreateImage(ctx, config, state.imageID, newLayer.ContentStoreDigest())
	if err != nil {
		return errors.Wrapf(err, "failed to export image")
	}
	state.imageID = exportedImage.ImageID()
	// Register the full mount (image + layer) now that the export succeeded.
	b.imageSources.Add(newImageMount(exportedImage, newLayer), platform)
	return nil
}
// performCopy executes a COPY/ADD instruction: it probes the build cache
// and, on a miss, mounts the destination image, creates a RW layer, copies
// each source into it (applying --chown ownership when requested), and
// exports the result as a new image.
func (b *Builder) performCopy(ctx context.Context, req dispatchRequest, inst copyInstruction) error {
	state := req.state
	srcHash := getSourceHashFromInfos(inst.infos)
	var chownComment string
	if inst.chownStr != "" {
		chownComment = fmt.Sprintf("--chown=%s ", inst.chownStr)
	}
	commentStr := fmt.Sprintf("%s %s%s in %s ", inst.cmdName, chownComment, srcHash, inst.dest)
	// TODO: should this have been using origPaths instead of srcHash in the comment?
	runConfigWithCommentCmd := copyRunConfig(state.runConfig, withCmdCommentString(commentStr, state.operatingSystem))
	hit, err := b.probeCache(state, runConfigWithCommentCmd)
	if err != nil || hit {
		// Error, or cache hit: nothing to copy.
		return err
	}
	// localOnly=true: the destination is the image built so far, which must
	// already exist locally.
	imgMount, err := b.imageSources.Get(ctx, state.imageID, true, req.builder.platform)
	if err != nil {
		return errors.Wrapf(err, "failed to get destination image %q", state.imageID)
	}
	rwLayer, err := imgMount.NewRWLayer()
	if err != nil {
		return err
	}
	defer rwLayer.Release()
	destInfo, err := createDestInfo(state.runConfig.WorkingDir, inst, rwLayer)
	if err != nil {
		return err
	}
	uid, gid := b.idMapping.RootPair()
	id := identity{UID: uid, GID: gid}
	// if a chown was requested, perform the steps to get the uid, gid
	// translated (if necessary because of user namespaces), and replace
	// the root pair with the chown pair for copy operations
	if inst.chownStr != "" {
		id, err = parseChownFlag(ctx, b, state, inst.chownStr, destInfo.root, b.idMapping)
		if err != nil {
			if b.options.Platform != "windows" {
				return errors.Wrapf(err, "unable to convert uid/gid chown string to host mapping")
			}
			return errors.Wrapf(err, "unable to map container user account name to SID")
		}
	}
	for _, info := range inst.infos {
		opts := copyFileOptions{
			decompress: inst.allowLocalDecompression,
			archiver:   b.getArchiver(),
		}
		// Unless the instruction preserves source ownership, stamp the
		// resolved identity onto the copied files.
		if !inst.preserveOwnership {
			opts.identity = &id
		}
		if err := performCopyForInfo(destInfo, info, opts); err != nil {
			return errors.Wrapf(err, "failed to copy files")
		}
	}
	return b.exportImage(ctx, state, rwLayer, imgMount.Image(), runConfigWithCommentCmd)
}
// createDestInfo resolves the instruction's destination against workingDir
// (a relative destination is made relative to WORKDIR) and pairs it with
// the writable layer's root.
func createDestInfo(workingDir string, inst copyInstruction, rwLayer builder.RWLayer) (copyInfo, error) {
	dest, err := normalizeDest(workingDir, inst.dest)
	if err != nil {
		return copyInfo{}, errors.Wrapf(err, "invalid %s", inst.cmdName)
	}
	return copyInfo{
		root: rwLayer.Root(),
		path: dest,
	}, nil
}
// For backwards compat, if there's just one info then use it as the
// cache look-up string, otherwise hash 'em all into one
func getSourceHashFromInfos(infos []copyInfo) string {
	if len(infos) == 1 {
		return infos[0].hash
	}
	hashes := make([]string, 0, len(infos))
	for _, info := range infos {
		hashes = append(hashes, info.hash)
	}
	return hashStringSlice("multi", hashes)
}
// hashStringSlice joins slice with commas, hashes the result with SHA-256,
// and returns "<prefix>:<hex digest>".
func hashStringSlice(prefix string, slice []string) string {
	digest := sha256.Sum256([]byte(strings.Join(slice, ",")))
	return prefix + ":" + hex.EncodeToString(digest[:])
}
// runConfigModifier mutates a container config in place; used with
// copyRunConfig to derive per-instruction configs.
type runConfigModifier func(*container.Config)

// withCmd sets the config's Cmd.
func withCmd(cmd []string) runConfigModifier {
	return func(runConfig *container.Config) {
		runConfig.Cmd = cmd
	}
}

// withArgsEscaped sets the config's ArgsEscaped flag.
func withArgsEscaped(argsEscaped bool) runConfigModifier {
	return func(runConfig *container.Config) {
		runConfig.ArgsEscaped = argsEscaped
	}
}
// withCmdComment sets Cmd to a nop comment string. See withCmdCommentString for
// why there are two almost identical versions of this.
func withCmdComment(comment string, platform string) runConfigModifier {
	return func(runConfig *container.Config) {
		// Two-arg form: "#(nop) " and the comment are separate arguments.
		runConfig.Cmd = append(getShell(runConfig, platform), "#(nop) ", comment)
	}
}

// withCmdCommentString exists to maintain compatibility with older versions.
// A few instructions (workdir, copy, add) used a nop comment that is a single arg
// where as all the other instructions used a two arg comment string. This
// function implements the single arg version.
func withCmdCommentString(comment string, platform string) runConfigModifier {
	return func(runConfig *container.Config) {
		runConfig.Cmd = append(getShell(runConfig, platform), "#(nop) "+comment)
	}
}
// withEnv replaces the config's environment with env.
func withEnv(env []string) runConfigModifier {
	return func(runConfig *container.Config) {
		runConfig.Env = env
	}
}

// withEntrypointOverride sets an entrypoint on runConfig if the command is
// not empty. The entrypoint is left unmodified if command is empty.
//
// The dockerfile RUN instruction expect to run without an entrypoint
// so the runConfig entrypoint needs to be modified accordingly. ContainerCreate
// will change a []string{""} entrypoint to nil, so we probe the cache with the
// nil entrypoint.
func withEntrypointOverride(cmd []string, entrypoint []string) runConfigModifier {
	return func(runConfig *container.Config) {
		if len(cmd) > 0 {
			runConfig.Entrypoint = entrypoint
		}
	}
}

// withoutHealthcheck disables healthcheck.
//
// The dockerfile RUN instruction expect to run without healthcheck
// so the runConfig Healthcheck needs to be disabled.
func withoutHealthcheck() runConfigModifier {
	return func(runConfig *container.Config) {
		runConfig.Healthcheck = &container.HealthConfig{
			Test: []string{"NONE"},
		}
	}
}
// copyRunConfig returns a copy of runConfig whose slice and map fields are
// duplicated — so mutating the copy never touches the original — with the
// given modifiers applied in order.
func copyRunConfig(runConfig *container.Config, modifiers ...runConfigModifier) *container.Config {
	cloned := *runConfig
	cloned.Cmd = copyStringSlice(runConfig.Cmd)
	cloned.Env = copyStringSlice(runConfig.Env)
	cloned.Entrypoint = copyStringSlice(runConfig.Entrypoint)
	cloned.OnBuild = copyStringSlice(runConfig.OnBuild)
	cloned.Shell = copyStringSlice(runConfig.Shell)
	if runConfig.Volumes != nil {
		volumes := make(map[string]struct{}, len(runConfig.Volumes))
		for name, v := range runConfig.Volumes {
			volumes[name] = v
		}
		cloned.Volumes = volumes
	}
	if runConfig.ExposedPorts != nil {
		ports := make(nat.PortSet, len(runConfig.ExposedPorts))
		for port, v := range runConfig.ExposedPorts {
			ports[port] = v
		}
		cloned.ExposedPorts = ports
	}
	if runConfig.Labels != nil {
		labels := make(map[string]string, len(runConfig.Labels))
		for key, value := range runConfig.Labels {
			labels[key] = value
		}
		cloned.Labels = labels
	}
	for _, modifier := range modifiers {
		modifier(&cloned)
	}
	return &cloned
}
// copyStringSlice returns an independent copy of orig, preserving nil-ness.
func copyStringSlice(orig []string) []string {
	if orig == nil {
		return nil
	}
	dup := make([]string, len(orig))
	copy(dup, orig)
	return dup
}
// getShell is a helper function which gets the right shell for prefixing the
// shell-form of RUN, ENTRYPOINT and CMD instructions. It always returns a
// fresh slice so callers may append without mutating the config.
func getShell(c *container.Config, os string) []string {
	shell := c.Shell
	if len(shell) == 0 {
		shell = defaultShellForOS(os)
	}
	return append([]string{}, shell...)
}
// probeCache consults the image prober for the current instruction; on a
// hit it advances the state's imageID to the cached image and reports true.
func (b *Builder) probeCache(dispatchState *dispatchState, runConfig *container.Config) (bool, error) {
	cachedID, err := b.imageProber.Probe(dispatchState.imageID, runConfig, b.getPlatform(dispatchState))
	if err != nil {
		return false, err
	}
	if cachedID == "" {
		return false, nil
	}
	_, _ = fmt.Fprintln(b.Stdout, " ---> Using cache")
	dispatchState.imageID = cachedID
	return true, nil
}
// defaultLogConfig disables logging for build containers, overriding any
// default log driver configured on the daemon.
var defaultLogConfig = container.LogConfig{Type: "none"}

// probeAndCreate checks the build cache for runConfig and, on a miss,
// creates a container to execute it. It returns "" on a cache hit.
func (b *Builder) probeAndCreate(ctx context.Context, dispatchState *dispatchState, runConfig *container.Config) (string, error) {
	if hit, err := b.probeCache(dispatchState, runConfig); err != nil || hit {
		return "", err
	}
	return b.create(ctx, runConfig)
}
// create creates a build container from runConfig, printing any creation
// warnings and the "Running in" progress line, and returns the container ID.
func (b *Builder) create(ctx context.Context, runConfig *container.Config) (string, error) {
	log.G(ctx).Debugf("[BUILDER] Command to be executed: %v", runConfig.Cmd)
	ctr, err := b.containerManager.Create(ctx, runConfig, hostConfigFromOptions(b.options))
	if err != nil {
		return "", err
	}
	for _, warning := range ctr.Warnings {
		_, _ = fmt.Fprintf(b.Stdout, " ---> [Warning] %s\n", warning)
	}
	_, _ = fmt.Fprintf(b.Stdout, " ---> Running in %s\n", stringid.TruncateID(ctr.ID))
	return ctr.ID, nil
}
// hostConfigFromOptions derives the HostConfig for build containers from
// the API build options: resource limits, isolation, network mode, extra
// hosts, and a disabled log driver.
func hostConfigFromOptions(options *build.ImageBuildOptions) *container.HostConfig {
	resources := container.Resources{
		CgroupParent: options.CgroupParent,
		CPUShares:    options.CPUShares,
		CPUPeriod:    options.CPUPeriod,
		CPUQuota:     options.CPUQuota,
		CpusetCpus:   options.CPUSetCPUs,
		CpusetMems:   options.CPUSetMems,
		Memory:       options.Memory,
		MemorySwap:   options.MemorySwap,
		Ulimits:      options.Ulimits,
	}
	// We need to make sure no empty string or "default" NetworkMode is
	// provided to the daemon as it doesn't support them.
	//
	// This is in line with what the ContainerCreate API endpoint does.
	networkMode := options.NetworkMode
	if networkMode == "" || networkMode == network.NetworkDefault {
		networkMode = networkSettings.DefaultNetwork
	}
	hc := &container.HostConfig{
		SecurityOpt: options.SecurityOpt,
		Isolation:   options.Isolation,
		ShmSize:     options.ShmSize,
		Resources:   resources,
		NetworkMode: container.NetworkMode(networkMode),
		// Set a log config to override any default value set on the daemon
		LogConfig:  defaultLogConfig,
		ExtraHosts: options.ExtraHosts,
	}
	return hc
}
// getPlatform determines the effective platform for cache probing: the
// builder's explicit platform (or the default spec), with the OS overridden
// by the current stage's operating system when known.
func (b *Builder) getPlatform(state *dispatchState) ocispec.Platform {
	// May be nil if not explicitly set in API/dockerfile
	p := platforms.DefaultSpec()
	if b.platform != nil {
		p = *b.platform
	}
	if stageOS := state.operatingSystem; stageOS != "" {
		p.OS = stageOS
	}
	return p
}
package dockerfile
import (
"context"
"path/filepath"
"strconv"
"strings"
"github.com/moby/sys/symlink"
"github.com/moby/sys/user"
"github.com/pkg/errors"
)
// parseChownFlag resolves a --chown value ("user[:group]") to a numeric
// host uid/gid pair, consulting the container rootfs's /etc/passwd and
// /etc/group for name lookups and translating the result through the
// user-namespace identity mapping.
func parseChownFlag(ctx context.Context, builder *Builder, state *dispatchState, chown, ctrRootPath string, identityMapping user.IdentityMapping) (identity, error) {
	parts := strings.Split(chown, ":")
	if len(parts) > 2 {
		return identity{}, errors.New("invalid chown string format: " + chown)
	}
	userStr := parts[0]
	// With no explicit group, the user spec doubles as the group spec.
	grpStr := parts[0]
	if len(parts) == 2 {
		grpStr = parts[1]
	}
	passwdPath, err := symlink.FollowSymlinkInScope(filepath.Join(ctrRootPath, "etc", "passwd"), ctrRootPath)
	if err != nil {
		return identity{}, errors.Wrap(err, "can't resolve /etc/passwd path in container rootfs")
	}
	groupPath, err := symlink.FollowSymlinkInScope(filepath.Join(ctrRootPath, "etc", "group"), ctrRootPath)
	if err != nil {
		return identity{}, errors.Wrap(err, "can't resolve /etc/group path in container rootfs")
	}
	uid, err := lookupUser(userStr, passwdPath)
	if err != nil {
		return identity{}, errors.Wrap(err, "can't find uid for user "+userStr)
	}
	gid, err := lookupGroup(grpStr, groupPath)
	if err != nil {
		return identity{}, errors.Wrap(err, "can't find gid for group "+grpStr)
	}
	// Convert as necessary because of user namespaces.
	uid, gid, err = identityMapping.ToHost(uid, gid)
	if err != nil {
		return identity{}, errors.Wrap(err, "unable to convert uid/gid to host mapping")
	}
	return identity{UID: uid, GID: gid}, nil
}
// lookupUser resolves userStr to a uid: numeric strings are used directly,
// otherwise the passwd file at filepath is searched by user name.
func lookupUser(userStr, filepath string) (int, error) {
	// A numeric string is already a uid; no file lookup needed.
	if uid, convErr := strconv.Atoi(userStr); convErr == nil {
		return uid, nil
	}
	matches, err := user.ParsePasswdFileFilter(filepath, func(u user.User) bool {
		return u.Name == userStr
	})
	if err != nil {
		return 0, err
	}
	if len(matches) == 0 {
		return 0, errors.New("no such user: " + userStr)
	}
	return matches[0].Uid, nil
}
// lookupGroup resolves groupStr to a gid: numeric strings are used
// directly, otherwise the group file at filepath is searched by group name.
func lookupGroup(groupStr, filepath string) (int, error) {
	// A numeric string is already a gid; no file lookup needed.
	if gid, convErr := strconv.Atoi(groupStr); convErr == nil {
		return gid, nil
	}
	matches, err := user.ParseGroupFileFilter(filepath, func(g user.Group) bool {
		return g.Name == groupStr
	})
	if err != nil {
		return 0, err
	}
	if len(matches) == 0 {
		return 0, errors.New("no such group: " + groupStr)
	}
	return matches[0].Gid, nil
}
package dockerfile
import (
gometrics "github.com/docker/go-metrics"
)
var (
	// buildsTriggered counts every image build started through this builder.
	buildsTriggered gometrics.Counter
	// buildsFailed counts failed builds, labeled by failure reason.
	buildsFailed gometrics.LabeledCounter
)

// Build metrics prometheus messages, these values must be initialized before
// using them. See the example below in the "builds_failed" metric definition.
const (
	metricsDockerfileSyntaxError = "dockerfile_syntax_error"
	metricsDockerfileEmptyError = "dockerfile_empty_error"
	metricsCommandNotSupportedError = "command_not_supported_error"
	metricsErrorProcessingCommandsError = "error_processing_commands_error"
	metricsBuildTargetNotReachableError = "build_target_not_reachable_error"
	metricsMissingOnbuildArgumentsError = "missing_onbuild_arguments_error"
	metricsUnknownInstructionError = "unknown_instruction_error"
	metricsBuildCanceled = "build_canceled"
)

// init registers the "builder" metrics namespace and pre-creates every
// known "reason" label value so all counters are exported from the start.
func init() {
	buildMetrics := gometrics.NewNamespace("builder", "", nil)
	buildsTriggered = buildMetrics.NewCounter("builds_triggered", "Number of triggered image builds")
	buildsFailed = buildMetrics.NewLabeledCounter("builds_failed", "Number of failed image builds", "reason")
	for _, r := range []string{
		metricsDockerfileSyntaxError,
		metricsDockerfileEmptyError,
		metricsCommandNotSupportedError,
		metricsErrorProcessingCommandsError,
		metricsBuildTargetNotReachableError,
		metricsMissingOnbuildArgumentsError,
		metricsUnknownInstructionError,
		metricsBuildCanceled,
	} {
		buildsFailed.WithValues(r)
	}
	gometrics.Register(buildMetrics)
}
package remotecontext
import (
"io"
"os"
"path/filepath"
"github.com/docker/docker/daemon/builder"
"github.com/docker/docker/pkg/longpath"
"github.com/docker/docker/pkg/system"
"github.com/docker/docker/pkg/tarsum"
"github.com/moby/go-archive/chrootarchive"
"github.com/moby/go-archive/compression"
"github.com/moby/sys/symlink"
"github.com/pkg/errors"
)
// archiveContext is a build source backed by a tar archive extracted into a
// temporary directory, with a tarsum checksum recorded per file.
type archiveContext struct {
	// root is the temporary directory holding the extracted archive.
	root string
	// sums holds per-file tarsum checksums; consulted by Hash.
	sums tarsum.FileInfoSums
}

// Close removes the temporary extraction directory and everything in it.
func (c *archiveContext) Close() error {
	return os.RemoveAll(c.root)
}
// convertPathError rewrites the Path of known path-carrying error types to
// cleanpath, so callers see the user-facing (clean) path instead of the
// resolved one. Other error types are returned unchanged.
func convertPathError(err error, cleanpath string) error {
	switch pathErr := err.(type) {
	case *os.PathError:
		pathErr.Path = cleanpath
		return pathErr
	case *system.XattrError:
		pathErr.Path = cleanpath
		return pathErr
	}
	return err
}
// modifiableContext is a build source whose entries can be deleted.
type modifiableContext interface {
	builder.Source
	// Remove deletes the entry specified by `path`.
	// It is usual for directory entries to delete all its subentries.
	Remove(path string) error
}
// FromArchive returns a build source from a tar stream.
//
// It extracts the tar stream to a temporary folder that is deleted as soon as
// the Context is closed.
// As the extraction happens, a tarsum is calculated for every file, and the set of
// all those sums then becomes the source of truth for all operations on this Context.
//
// Closing tarStream has to be done by the caller.
func FromArchive(tarStream io.Reader) (builder.Source, error) {
	root, err := longpath.MkdirTemp("", "docker-builder")
	if err != nil {
		return nil, err
	}
	// Assume local file system. Since it's coming from a tar file.
	tsc := &archiveContext{root: root}
	// Make sure we clean-up upon error. In the happy case the caller
	// is expected to manage the clean-up
	defer func() {
		// err is the function-scoped variable reassigned below, so any later
		// failure triggers this cleanup.
		if err != nil {
			tsc.Close()
		}
	}()
	decompressedStream, err := compression.DecompressStream(tarStream)
	if err != nil {
		return nil, err
	}
	// Wrap the stream so per-file checksums are computed during extraction.
	sum, err := tarsum.NewTarSum(decompressedStream, true, tarsum.Version1)
	if err != nil {
		return nil, err
	}
	err = chrootarchive.Untar(sum, root, nil)
	if err != nil {
		return nil, err
	}
	// Untar consumed the tarsum reader, so the per-file sums are complete.
	tsc.sums = sum.GetSums()
	return tsc, nil
}
func (c *archiveContext) Root() string {
return c.root
}
// Remove deletes path (and any children) from the context. The path is
// resolved inside the context root first, so it cannot escape it.
func (c *archiveContext) Remove(path string) error {
	_, resolved, err := normalize(path, c.root)
	if err != nil {
		return err
	}
	return os.RemoveAll(resolved)
}
// Hash returns the tarsum recorded for path during extraction. When no sum
// was recorded (e.g. an empty relative path) the path itself is returned for
// backwards compatibility.
func (c *archiveContext) Hash(path string) (string, error) {
	cleanpath, fullpath, err := normalize(path, c.root)
	if err != nil {
		return "", err
	}

	rel, err := filepath.Rel(c.root, fullpath)
	if err != nil {
		return "", convertPathError(err, cleanpath)
	}

	// Use the checksum of the followed path(not the possible symlink) because
	// this is the file that is actually copied.
	if info := c.sums.GetFile(filepath.ToSlash(rel)); info != nil {
		return info.Sum(), nil
	}

	// We set sum to path by default for the case where GetFile returns nil.
	// The usual case is if relative path is empty.
	return path, nil // backwards compat TODO: see if really needed
}
// normalize cleans path and resolves it (following symlinks) to an absolute
// location that is guaranteed to stay within root. A path escaping root is
// reported as a forbidden-path error.
func normalize(path string, root string) (cleanPath, fullPath string, _ error) {
	// Rooted clean: prefix with the separator, clean, then strip the prefix.
	cleanPath = filepath.Clean(string(filepath.Separator) + path)[1:]
	var err error
	fullPath, err = symlink.FollowSymlinkInScope(filepath.Join(root, path), root)
	if err != nil {
		return "", "", errors.Wrapf(err, "forbidden path outside the build context: %s (%s)", path, cleanPath)
	}
	return cleanPath, fullPath, nil
}
package remotecontext
import (
"bufio"
"context"
"fmt"
"io"
"os"
"path/filepath"
"runtime"
"strings"
"github.com/containerd/continuity/driver"
"github.com/containerd/log"
"github.com/docker/docker/daemon/builder"
"github.com/docker/docker/daemon/builder/remotecontext/urlutil"
"github.com/docker/docker/errdefs"
"github.com/moby/buildkit/frontend/dockerfile/parser"
"github.com/moby/moby/api/types/backend"
"github.com/moby/patternmatcher"
"github.com/moby/patternmatcher/ignorefile"
"github.com/moby/sys/symlink"
"github.com/pkg/errors"
)
// ClientSessionRemote is the identifier for the client-session context
// transport. It is no longer supported by the v1 builder (see Detect, which
// rejects it with an InvalidParameter error).
const ClientSessionRemote = "client-session"
// Detect returns a context and dockerfile from remote location or local
// archive. The remote-context option is inspected in order: empty means a
// local tar archive, the legacy client-session transport is rejected, then
// git URLs and plain HTTP(S) URLs are handled.
func Detect(config backend.BuildConfig) (remote builder.Source, dockerfile *parser.Result, _ error) {
	remoteURL := config.Options.RemoteContext
	if remoteURL == "" {
		return newArchiveRemote(config.Source, config.Options.Dockerfile)
	}
	if remoteURL == ClientSessionRemote {
		return nil, nil, errdefs.InvalidParameter(errors.New("experimental session with v1 builder is no longer supported, use builder version v2 (BuildKit) instead"))
	}
	if urlutil.IsGitURL(remoteURL) {
		return newGitRemote(remoteURL, config.Options.Dockerfile)
	}
	if urlutil.IsURL(remoteURL) {
		return newURLRemote(remoteURL, config.Options.Dockerfile, config.ProgressWriter.ProgressReaderFunc)
	}
	return nil, nil, fmt.Errorf("remoteURL (%s) could not be recognized as URL", remoteURL)
}
// newArchiveRemote extracts the local tar stream rc into a temporary context
// and parses the Dockerfile at dockerfilePath inside it. rc is always closed.
func newArchiveRemote(rc io.ReadCloser, dockerfilePath string) (builder.Source, *parser.Result, error) {
	defer rc.Close()

	src, err := FromArchive(rc)
	if err != nil {
		return nil, nil, err
	}
	return withDockerfileFromContext(src.(modifiableContext), dockerfilePath)
}
// withDockerfileFromContext opens and parses the Dockerfile at dockerfilePath
// inside context c, removes the Dockerfile (and .dockerignore) from the
// context when the ignore rules say so, and returns the context together with
// the parse result. For the default Dockerfile name a lowercase fallback is
// tried. On every error path the context is closed so its temporary
// directory is released.
func withDockerfileFromContext(c modifiableContext, dockerfilePath string) (builder.Source, *parser.Result, error) {
	df, err := openAt(c, dockerfilePath)
	if err != nil {
		if errors.Is(err, os.ErrNotExist) {
			// Fall back to the all-lowercase "dockerfile" only when the
			// caller asked for the default name.
			if dockerfilePath == builder.DefaultDockerfileName {
				lowercase := strings.ToLower(dockerfilePath)
				if _, err := StatAt(c, lowercase); err == nil {
					return withDockerfileFromContext(c, lowercase)
				}
			}
			// Fix: close the context here too; previously only the generic
			// open-error path below released the temporary directory.
			c.Close()
			return nil, nil, errors.Errorf("Cannot locate specified Dockerfile: %s", dockerfilePath) // backwards compatible error
		}
		c.Close()
		return nil, nil, err
	}
	// Fix: defer the close so the Dockerfile handle no longer leaks when
	// parsing fails (it was previously closed only on the success path).
	defer df.Close()

	res, err := readAndParseDockerfile(dockerfilePath, df)
	if err != nil {
		// Fix: the context was previously leaked on parse errors.
		c.Close()
		return nil, nil, err
	}

	if err := removeDockerfile(c, dockerfilePath); err != nil {
		c.Close()
		return nil, nil, err
	}

	return c, res, nil
}
// newGitRemote clones gitURL into a temporary context and parses the
// Dockerfile at dockerfilePath inside it.
func newGitRemote(gitURL string, dockerfilePath string) (builder.Source, *parser.Result, error) {
	src, err := MakeGitContext(gitURL) // TODO: change this to NewLazySource
	if err != nil {
		return nil, nil, err
	}
	return withDockerfileFromContext(src.(modifiableContext), dockerfilePath)
}
// newURLRemote downloads the remote context at url. A text/plain response is
// treated as a bare Dockerfile (no context archive); anything else is
// extracted as a tar archive and the Dockerfile is located inside it.
func newURLRemote(url string, dockerfilePath string, progressReader func(in io.ReadCloser) io.ReadCloser) (builder.Source, *parser.Result, error) {
	contentType, content, err := downloadRemote(url)
	if err != nil {
		return nil, nil, err
	}
	defer content.Close()

	if contentType == mimeTypeTextPlain {
		// The body is the Dockerfile itself; there is no build context.
		res, err := parser.Parse(progressReader(content))
		return nil, res, errdefs.InvalidParameter(err)
	}

	source, err := FromArchive(progressReader(content))
	if err != nil {
		return nil, nil, err
	}
	return withDockerfileFromContext(source.(modifiableContext), dockerfilePath)
}
// removeDockerfile deletes the .dockerignore file plus any of filesToRemove
// that are matched by the .dockerignore patterns. A context without a
// .dockerignore is left untouched. Removal failures are logged, not returned.
func removeDockerfile(c modifiableContext, filesToRemove ...string) error {
	f, err := openAt(c, ".dockerignore")
	// Note that a missing .dockerignore file isn't treated as an error
	if os.IsNotExist(err) {
		return nil
	}
	if err != nil {
		return err
	}

	excludes, readErr := ignorefile.ReadAll(f)
	f.Close()
	if readErr != nil {
		return errors.Wrap(readErr, "error reading .dockerignore")
	}

	// .dockerignore itself is always a removal candidate.
	for _, target := range append([]string{".dockerignore"}, filesToRemove...) {
		matched, _ := patternmatcher.MatchesOrParentMatches(target, excludes)
		if !matched {
			continue
		}
		if err := c.Remove(target); err != nil {
			log.G(context.TODO()).Errorf("failed to remove %s: %v", target, err)
		}
	}
	return nil
}
// readAndParseDockerfile parses the Dockerfile contents read from rc,
// rejecting an empty file with an InvalidParameter error.
func readAndParseDockerfile(name string, rc io.Reader) (*parser.Result, error) {
	br := bufio.NewReader(rc)
	// Peek one byte to distinguish "empty file" from genuine read errors
	// before handing the stream to the parser.
	if _, err := br.Peek(1); err != nil {
		if err == io.EOF {
			return nil, errdefs.InvalidParameter(errors.Errorf("the Dockerfile (%s) cannot be empty", name))
		}
		return nil, errors.Wrap(err, "unexpected error reading Dockerfile")
	}

	result, err := parser.Parse(br)
	if err != nil {
		return nil, errdefs.InvalidParameter(errors.Wrapf(err, "failed to parse %s", name))
	}
	return result, nil
}
// openAt opens path resolved relative to the source's root, refusing paths
// that escape the build context.
func openAt(remote builder.Source, path string) (driver.File, error) {
	resolved, err := FullPath(remote, path)
	if err != nil {
		return nil, err
	}
	return os.Open(resolved)
}
// StatAt is a helper for calling Stat on a path from a source. The path is
// resolved within the source's root before being stat'ed.
func StatAt(remote builder.Source, path string) (os.FileInfo, error) {
	resolved, err := FullPath(remote, path)
	if err != nil {
		return nil, err
	}
	return os.Stat(resolved)
}
// FullPath is a helper for getting a full path for a path from a source.
// Symlinks are followed but must stay within the source's root; escaping
// paths produce an error (with a Windows-specific message for backward
// compatibility).
func FullPath(remote builder.Source, path string) (string, error) {
	remoteRoot := remote.Root()
	fullPath, err := symlink.FollowSymlinkInScope(filepath.Join(remoteRoot, path), remoteRoot)
	if err == nil {
		return fullPath, nil
	}
	if runtime.GOOS == "windows" {
		return "", fmt.Errorf("failed to resolve scoped path %s (%s): %s. Possible cause is a forbidden path outside the build context", path, fullPath, err)
	}
	return "", fmt.Errorf("forbidden path outside the build context: %s (%s)", path, fullPath) // backwards compat with old error
}
package remotecontext
import (
"archive/tar"
"crypto/sha256"
"hash"
"os"
"github.com/docker/docker/pkg/tarsum"
"github.com/moby/go-archive"
)
// NewFileHash returns new hash that is used for the builder cache keys.
// The hash is seeded with a tar header built from the file's metadata
// (including security xattrs and, for symlinks, the link target), so cache
// keys change when metadata changes, not just contents.
func NewFileHash(path, name string, fi os.FileInfo) (hash.Hash, error) {
	var link string
	if fi.Mode()&os.ModeSymlink != 0 {
		target, err := os.Readlink(path)
		if err != nil {
			return nil, err
		}
		link = target
	}

	header, err := archive.FileInfoHeader(name, fi, link)
	if err != nil {
		return nil, err
	}
	if err := archive.ReadSecurityXattrToTarHeader(path, header); err != nil {
		return nil, err
	}

	tsh := &tarsumHash{hdr: header, Hash: sha256.New()}
	tsh.Reset() // initialize header
	return tsh, nil
}
// tarsumHash is a hash.Hash whose state is seeded with a serialized tar
// header on Reset, so file metadata participates in the resulting digest.
type tarsumHash struct {
	hash.Hash
	hdr *tar.Header // header written into the hash on every Reset
}
// Reset resets the Hash to its initial state.
func (tsh *tarsumHash) Reset() {
	// comply with hash.Hash and reset to the state hash had before any writes
	tsh.Hash.Reset()
	// Re-seed with the tar header so metadata is always part of the digest.
	tarsum.WriteV1Header(tsh.hdr, tsh.Hash)
}
package remotecontext
import (
"context"
"os"
"github.com/containerd/log"
"github.com/docker/docker/daemon/builder"
"github.com/docker/docker/daemon/builder/remotecontext/git"
"github.com/moby/go-archive"
)
// MakeGitContext returns a Context from gitURL that is cloned in a temporary directory.
// The clone is tarred up and re-extracted through FromArchive; the clone
// directory itself is removed before returning.
func MakeGitContext(gitURL string) (builder.Source, error) {
	root, err := git.Clone(gitURL, git.WithIsolatedConfig(true))
	if err != nil {
		return nil, err
	}

	c, err := archive.Tar(root, archive.Uncompressed)
	if err != nil {
		return nil, err
	}

	// FromArchive fully consumes the tar stream before returning, so it is
	// safe for this deferred cleanup to close the stream and delete the
	// clone afterwards. Cleanup failures are logged, not returned: the
	// extracted context is still usable.
	defer func() {
		if err := c.Close(); err != nil {
			log.G(context.TODO()).WithFields(log.Fields{
				"error":  err,
				"action": "MakeGitContext",
				"module": "builder",
				"url":    gitURL,
			}).Error("error while closing git context")
		}
		if err := os.RemoveAll(root); err != nil {
			log.G(context.TODO()).WithFields(log.Fields{
				"error":  err,
				"action": "MakeGitContext",
				"module": "builder",
				"url":    gitURL,
			}).Error("error while removing path and children of root")
		}
	}()
	return FromArchive(c)
}
package git
import (
"net/http"
"net/url"
"os"
"os/exec"
"path/filepath"
"strings"
"github.com/moby/sys/symlink"
"github.com/pkg/errors"
)
// gitRepo describes a remote repository together with the ref to check out
// and the subdirectory to use as the build context.
type gitRepo struct {
	remote        string // remote URL ("https://", "git://", "ssh://", or "git@" form)
	ref           string // branch/tag/ref to check out; defaults to "master" (see getRefAndSubdir)
	subdir        string // optional subdirectory within the checkout used as context
	isolateConfig bool   // when true, user and system gitconfig files are ignored
}
// CloneOption changes the behaviour of Clone(). Options are applied to the
// parsed repository description before cloning starts.
type CloneOption func(*gitRepo)
// WithIsolatedConfig disables reading the user or system gitconfig files when
// performing Git operations.
func WithIsolatedConfig(v bool) CloneOption {
	return func(gr *gitRepo) { gr.isolateConfig = v }
}
// Clone clones a repository into a newly created directory which
// will be under "docker-build-git". It returns the checkout directory
// (possibly a subdirectory of the clone when the URL fragment names one).
func Clone(remoteURL string, opts ...CloneOption) (string, error) {
	repo, err := parseRemoteURL(remoteURL)
	if err != nil {
		return "", err
	}

	for _, apply := range opts {
		apply(&repo)
	}
	return repo.clone()
}
// clone performs the actual clone: init a fresh repository in a temp
// directory, add the remote, fetch the requested ref (shallow when the
// server supports it), check it out, and initialize submodules. On failure
// the temp directory is removed.
func (repo gitRepo) clone() (checkoutDir string, retErr error) {
	fetch := fetchArgs(repo.remote, repo.ref)

	root, err := os.MkdirTemp("", "docker-build-git")
	if err != nil {
		return "", err
	}
	defer func() {
		if retErr != nil {
			_ = os.RemoveAll(root)
		}
	}()

	if out, err := repo.gitWithinDir(root, "init"); err != nil {
		return "", errors.Wrapf(err, "failed to init repo at %s: %s", root, out)
	}

	// Add origin remote for compatibility with previous implementation that
	// used "git clone" and also to make sure local refs are created for branches
	if out, err := repo.gitWithinDir(root, "remote", "add", "origin", repo.remote); err != nil {
		return "", errors.Wrapf(err, "failed add origin repo at %s: %s", repo.remote, out)
	}

	if output, err := repo.gitWithinDir(root, fetch...); err != nil {
		return "", errors.Wrapf(err, "error fetching: %s", output)
	}

	checkoutDir, err = repo.checkout(root)
	if err != nil {
		return "", err
	}

	// Fix: run the submodule update through gitWithinDir so it gets the same
	// hardening as every other git invocation here — protocol.file.allow=never
	// (whose very purpose is blocking filesystem repos as submodules),
	// GIT_PROTOCOL_FROM_USER=0, and the optional gitconfig isolation. The
	// previous bare exec.Command bypassed all of that.
	if output, err := repo.gitWithinDir(root, "submodule", "update", "--init", "--recursive", "--depth=1"); err != nil {
		return "", errors.Wrapf(err, "error initializing submodules: %s", output)
	}

	return checkoutDir, nil
}
// parseRemoteURL splits remoteURL into remote address, ref, and context
// subdirectory. Bare addresses get an implicit "https://" prefix, "git@"
// addresses are split manually (they are not URLs), and refs starting with
// "-" are rejected to avoid being interpreted as git options.
func parseRemoteURL(remoteURL string) (gitRepo, error) {
	if !isGitTransport(remoteURL) {
		remoteURL = "https://" + remoteURL
	}

	var repo gitRepo
	if strings.HasPrefix(remoteURL, "git@") {
		// git@.. is not an URL, so cannot be parsed as URL
		remote, fragment, _ := strings.Cut(remoteURL, "#")
		repo.remote = remote
		repo.ref, repo.subdir = getRefAndSubdir(fragment)
	} else {
		u, err := url.Parse(remoteURL)
		if err != nil {
			return repo, err
		}
		repo.ref, repo.subdir = getRefAndSubdir(u.Fragment)
		u.Fragment = ""
		repo.remote = u.String()
	}

	if strings.HasPrefix(repo.ref, "-") {
		return gitRepo{}, errors.Errorf("invalid refspec: %s", repo.ref)
	}
	return repo, nil
}
// getRefAndSubdir splits a URL fragment of the form "ref:subdir". An empty
// ref defaults to "master"; the subdirectory may be empty.
func getRefAndSubdir(fragment string) (ref string, subdir string) {
	ref, subdir, _ = strings.Cut(fragment, ":")
	if ref != "" {
		return ref, subdir
	}
	return "master", subdir
}
// fetchArgs builds the git fetch arguments for ref, adding a depth-1 shallow
// fetch when the remote supports it. The "--" guards against refs being
// parsed as options.
func fetchArgs(remoteURL string, ref string) []string {
	if supportsShallowClone(remoteURL) {
		return []string{"fetch", "--depth", "1", "origin", "--", ref}
	}
	return []string{"fetch", "origin", "--", ref}
}
// Check if a given git URL supports a shallow git clone,
// i.e. it is a non-HTTP server or a smart HTTP server.
func supportsShallowClone(remoteURL string) bool {
	if scheme := getScheme(remoteURL); scheme == "http" || scheme == "https" {
		// Check if the HTTP server is smart

		// Smart servers must correctly respond to a query for the git-upload-pack service
		serviceURL := remoteURL + "/info/refs?service=git-upload-pack"

		// Try a HEAD request and fallback to a Get request on error
		res, err := http.Head(serviceURL) // #nosec G107
		if res != nil && res.Body != nil {
			// NOTE: res.Body is evaluated when the defer is queued, so this
			// closes the HEAD response body even though res is reassigned to
			// the GET response below.
			defer res.Body.Close()
		}
		if err != nil || res.StatusCode != http.StatusOK {
			res, err = http.Get(serviceURL) // #nosec G107
			if err == nil {
				// Only headers are needed; close the GET body immediately.
				res.Body.Close()
			}
			if err != nil || res.StatusCode != http.StatusOK {
				// request failed
				return false
			}
		}

		if res.Header.Get("Content-Type") != "application/x-git-upload-pack-advertisement" {
			// Fallback, not a smart server
			return false
		}
		return true
	}
	// Non-HTTP protocols always support shallow clones
	return true
}
// checkout checks out repo.ref inside root and returns the directory to use
// as the build context: root itself, or the configured subdirectory (which
// must resolve inside root and be a directory).
func (repo gitRepo) checkout(root string) (string, error) {
	// Try checking out by ref name first. This will work on branches and sets
	// .git/HEAD to the current branch name
	if output, err := repo.gitWithinDir(root, "checkout", repo.ref); err != nil {
		// If checking out by branch name fails check out the last fetched ref
		if _, err2 := repo.gitWithinDir(root, "checkout", "FETCH_HEAD"); err2 != nil {
			return "", errors.Wrapf(err, "error checking out %s: %s", repo.ref, output)
		}
	}

	if repo.subdir == "" {
		return root, nil
	}

	scoped, err := symlink.FollowSymlinkInScope(filepath.Join(root, repo.subdir), root)
	if err != nil {
		return "", errors.Wrapf(err, "error setting git context, %q not within git root", repo.subdir)
	}
	info, err := os.Stat(scoped)
	if err != nil {
		return "", err
	}
	if !info.IsDir() {
		return "", errors.Errorf("error setting git context, not a directory: %s", scoped)
	}
	return scoped, nil
}
// gitWithinDir runs git with args inside dir, returning the combined output.
// Every invocation is hardened: file-protocol access is forbidden and unsafe
// remote protocols are disabled; config isolation is applied when requested.
func (repo gitRepo) gitWithinDir(dir string, args ...string) ([]byte, error) {
	// Block sneaky repositories from using repos from the filesystem as submodules.
	fullArgs := append([]string{"-c", "protocol.file.allow=never"}, args...)

	cmd := exec.Command("git", fullArgs...)
	cmd.Dir = dir

	// Disable unsafe remote protocols.
	env := append(os.Environ(), "GIT_PROTOCOL_FROM_USER=0")
	if repo.isolateConfig {
		env = append(env,
			"GIT_CONFIG_NOSYSTEM=1", // Disable reading from system gitconfig.
			"HOME=/dev/null",        // Disable reading from user gitconfig.
		)
	}
	cmd.Env = env

	return cmd.CombinedOutput()
}
// isGitTransport returns true if the provided str is a git transport by inspecting
// the prefix of the string for known protocols used in git.
func isGitTransport(str string) bool {
	if strings.HasPrefix(str, "git@") {
		return true
	}
	scheme := getScheme(str)
	return scheme == "git" || scheme == "http" || scheme == "https" || scheme == "ssh"
}
// getScheme returns addresses' scheme in lowercase, or an empty
// string in case address is an invalid URL.
func getScheme(address string) string {
	parsed, err := url.Parse(address)
	if err != nil {
		return ""
	}
	return parsed.Scheme
}
package remotecontext
import (
"encoding/hex"
"os"
"path/filepath"
"github.com/docker/docker/daemon/builder"
"github.com/docker/docker/pkg/pools"
"github.com/pkg/errors"
)
// NewLazySource creates a new LazyContext. LazyContext defines a hashed build
// context based on a root directory. Individual files are hashed first time
// they are asked. It is not safe to call methods of LazyContext concurrently.
func NewLazySource(root string) (builder.Source, error) {
	src := &lazySource{
		root: root,
		sums: map[string]string{},
	}
	return src, nil
}
// lazySource hashes files on first request, caching each result keyed by the
// path relative to root.
type lazySource struct {
	root string            // build context root directory
	sums map[string]string // relPath -> hex digest, filled lazily by Hash/prepareHash
}
// Root returns the root directory of this lazy build context.
func (c *lazySource) Root() string {
	root := c.root
	return root
}
// Close is a no-op: the lazy source does not own the root directory.
func (c *lazySource) Close() error {
	var err error
	return err
}
// Hash returns the cached hash for path, computing and caching it on first
// use. A path that cannot be stat'ed (e.g. a broken symlink) returns the
// relative path itself for backwards compatibility.
func (c *lazySource) Hash(path string) (string, error) {
	cleanPath, fullPath, err := normalize(path, c.root)
	if err != nil {
		return "", err
	}

	relPath, err := filepath.Rel(c.root, fullPath)
	if err != nil {
		return "", errors.WithStack(convertPathError(err, cleanPath))
	}

	fi, err := os.Lstat(fullPath)
	if err != nil {
		// Backwards compatibility: a missing file returns a path as hash.
		// This is reached in the case of a broken symlink.
		return relPath, nil
	}

	if cached, ok := c.sums[relPath]; ok {
		return cached, nil
	}
	return c.prepareHash(relPath, fi)
}
// prepareHash computes the hash for relPath (metadata plus, for non-empty
// regular files, contents), stores it in the cache, and returns it.
func (c *lazySource) prepareHash(relPath string, fi os.FileInfo) (string, error) {
	fullPath := filepath.Join(c.root, relPath)
	hasher, err := NewFileHash(fullPath, relPath, fi)
	if err != nil {
		return "", errors.Wrapf(err, "failed to create hash for %s", relPath)
	}

	// Only regular, non-empty files contribute contents to the digest.
	if fi.Mode().IsRegular() && fi.Size() > 0 {
		f, err := os.Open(fullPath)
		if err != nil {
			return "", errors.Wrapf(err, "failed to open %s", relPath)
		}
		defer f.Close()
		if _, err := pools.Copy(hasher, f); err != nil {
			return "", errors.Wrapf(err, "failed to copy file data for %s", relPath)
		}
	}

	sum := hex.EncodeToString(hasher.Sum(nil))
	c.sums[relPath] = sum
	return sum, nil
}
// Rel is an alias for [filepath.Rel].
//
// Deprecated: use [filepath.Rel] instead; this function is no longer used and will be removed in the next release.
func Rel(basepath string, targpath string) (string, error) {
	rel, err := filepath.Rel(basepath, targpath)
	return rel, err
}
package remotecontext
import (
"mime"
"net/http"
)
// MIME content types.
const (
	// mimeTypeTextPlain indicates the remote URL points at a bare Dockerfile.
	mimeTypeTextPlain = "text/plain"
	// mimeTypeOctetStream is the generic fallback type; receiving it triggers
	// content sniffing (see inspectResponse).
	mimeTypeOctetStream = "application/octet-stream"
)
// detectContentType returns a best guess representation of the MIME
// content type for the bytes at c. The value detected by
// http.DetectContentType is guaranteed not be nil, defaulting to
// application/octet-stream when a better guess cannot be made. The
// result of this detection is then run through mime.ParseMediaType()
// which separates the actual MIME string from any parameters.
func detectContentType(c []byte) (string, error) {
	sniffed := http.DetectContentType(c)
	mediaType, _, err := mime.ParseMediaType(sniffed)
	if err != nil {
		return "", err
	}
	return mediaType, nil
}
package remotecontext
import (
"bytes"
"fmt"
"io"
"net"
"net/http"
"net/url"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/internal/lazyregexp"
"github.com/docker/docker/pkg/ioutils"
"github.com/pkg/errors"
)
// When downloading remote contexts, limit the amount (in bytes)
// to be read from the response body in order to detect its Content-Type.
// Used by inspectResponse to size the sniffing preamble.
const maxPreambleLength = 100
// acceptableRemoteMIME matches content types accepted for a remote build
// context: tar archives (optionally gzip/bzip2/xz compressed), generic octet
// streams, and plain text (a bare Dockerfile).
const acceptableRemoteMIME = `(?:application/(?:(?:x\-)?tar|octet\-stream|((?:x\-)?(?:gzip|bzip2?|xz)))|(?:text/plain))`

// mimeRe is compiled lazily on first use; see selectAcceptableMIME.
var mimeRe = lazyregexp.New(acceptableRemoteMIME)
// downloadRemote context from a url and returns it, along with the parsed content type.
// The returned ReadCloser replays any bytes consumed during content-type
// detection and closes the underlying response body.
func downloadRemote(remoteURL string) (string, io.ReadCloser, error) {
	response, err := GetWithStatusError(remoteURL)
	if err != nil {
		return "", nil, errors.Wrapf(err, "error downloading remote context %s", remoteURL)
	}

	contentType, contextReader, err := inspectResponse(
		response.Header.Get("Content-Type"),
		response.Body,
		response.ContentLength,
	)
	if err == nil {
		return contentType, ioutils.NewReadCloserWrapper(contextReader, response.Body.Close), nil
	}

	response.Body.Close()
	return "", nil, errors.Wrapf(err, "error detecting content type for remote %s", remoteURL)
}
// GetWithStatusError does an http.Get() and returns an error if the
// status code is 4xx or 5xx. Transport failures are classified as NotFound
// (non-timeout DNS errors) or System; HTTP error statuses are mapped onto
// the matching errdefs category with the response body in the message.
func GetWithStatusError(address string) (*http.Response, error) {
	resp, err := http.Get(address) // #nosec G107 -- ignore G107: Potential HTTP request made with variable url
	if err != nil {
		if uErr, ok := err.(*url.Error); ok {
			if dErr, ok := uErr.Err.(*net.DNSError); ok && !dErr.IsTimeout {
				return nil, errdefs.NotFound(err)
			}
		}
		return nil, errdefs.System(err)
	}
	if resp.StatusCode < http.StatusBadRequest {
		// Success (or redirect already followed by the client).
		return resp, nil
	}

	msg := fmt.Sprintf("failed to GET %s with status %s", address, resp.Status)
	body, readErr := io.ReadAll(resp.Body)
	_ = resp.Body.Close()
	if readErr != nil {
		return nil, errdefs.System(errors.New(msg + ": error reading body"))
	}
	msg += ": " + string(bytes.TrimSpace(body))

	switch resp.StatusCode {
	case http.StatusNotFound:
		return nil, errdefs.NotFound(errors.New(msg))
	case http.StatusBadRequest:
		return nil, errdefs.InvalidParameter(errors.New(msg))
	case http.StatusUnauthorized:
		return nil, errdefs.Unauthorized(errors.New(msg))
	case http.StatusForbidden:
		return nil, errdefs.Forbidden(errors.New(msg))
	default:
		return nil, errdefs.Unknown(errors.New(msg))
	}
}
// inspectResponse looks into the http response data at r to determine whether its
// content-type is on the list of acceptable content types for remote build contexts.
// This function returns:
//   - a string representation of the detected content-type
//   - an io.Reader for the response body (replaying any sniffed bytes)
//   - an error value which will be non-nil either when something goes wrong while
//     reading bytes from r or when the detected content-type is not acceptable.
func inspectResponse(ct string, r io.Reader, clen int64) (string, io.Reader, error) {
	plen := clen
	if plen <= 0 || plen > maxPreambleLength {
		plen = maxPreambleLength
	}

	preamble := make([]byte, plen)
	// Fix: use io.ReadFull instead of a single r.Read — a single Read on a
	// network stream may legitimately return fewer bytes than are available,
	// which would truncate the sniffing window.
	rlen, err := io.ReadFull(r, preamble)
	if rlen == 0 {
		return ct, r, errors.New("empty response")
	}
	// A short body (EOF before the buffer filled) is fine as long as we read
	// something; any other error is fatal.
	if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
		return ct, r, err
	}

	preambleR := bytes.NewReader(preamble[:rlen])
	bodyReader := io.MultiReader(preambleR, r)

	// Some web servers will use application/octet-stream as the default
	// content type for files without an extension (e.g. 'Dockerfile')
	// so if we receive this value we better check for text content
	contentType := ct
	if ct == "" || ct == mimeTypeOctetStream {
		// Fix: sniff only the bytes actually read. Passing the whole buffer
		// included its zeroed tail, which made short text bodies look binary.
		contentType, err = detectContentType(preamble[:rlen])
		if err != nil {
			return contentType, bodyReader, err
		}
	}

	contentType = selectAcceptableMIME(contentType)
	var cterr error
	if contentType == "" {
		cterr = fmt.Errorf("unsupported Content-Type %q", ct)
		contentType = ct
	}

	return contentType, bodyReader, cterr
}
// selectAcceptableMIME returns the first substring of ct that matches
// acceptableRemoteMIME, or "" when the content type is not acceptable.
func selectAcceptableMIME(ct string) string {
	return mimeRe.FindString(ct)
}
// Copyright 2022 Google LLC. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package remotecontext
import (
"bytes"
fuzz "github.com/AdaLogics/go-fuzz-headers"
)
// FuzzreadAndParseDockerfile feeds fuzzer-derived names and contents into
// readAndParseDockerfile. It returns 1 when the target was exercised and 0
// when the fuzz input was too short to build the arguments.
func FuzzreadAndParseDockerfile(data []byte) int {
	consumer := fuzz.NewConsumer(data)

	name, err := consumer.GetString()
	if err != nil {
		return 0
	}
	contents, err := consumer.GetBytes()
	if err != nil {
		return 0
	}

	_, _ = readAndParseDockerfile(name, bytes.NewReader(contents))
	return 1
}
// Package urlutil provides helper function to check if a given build-context
// location should be considered a URL or a remote Git repository.
//
// This package is specifically written for use with docker build contexts, and
// should not be used as a general-purpose utility.
package urlutil
import (
"strings"
"github.com/docker/docker/internal/lazyregexp"
)
// urlPathWithFragmentSuffix matches fragments to use as Git reference and build
// context from the Git repository. It matches paths ending in ".git" with an
// optional "#fragment" suffix. See IsGitURL for details.
var urlPathWithFragmentSuffix = lazyregexp.New(`\.git(?:#.+)?$`)
// IsURL returns true if the provided str is an HTTP(S) URL by checking if it
// has a http:// or https:// scheme. No validation is performed to verify if the
// URL is well-formed.
func IsURL(str string) bool {
	for _, prefix := range []string{"https://", "http://"} {
		if strings.HasPrefix(str, prefix) {
			return true
		}
	}
	return false
}
// IsGitURL returns true if the provided str is a remote git repository "URL".
//
// This function only performs a rudimentary check (no validation is performed
// to ensure the URL is well-formed), and is written specifically for use with
// docker build, with some logic for backward compatibility with older versions
// of docker: do not use this function as a general-purpose utility.
//
// The following patterns are considered to be a Git URL:
//
//   - https://(.*).git(?:#.+)?$  git repository URL with optional fragment, as known to be used by GitHub and GitLab.
//   - http://(.*).git(?:#.+)?$   same, but non-TLS
//   - git://(.*)                 URLs using git:// scheme
//   - git@(.*)
//   - github.com/                see description below
//
// The github.com/ prefix is a special case used to treat context-paths
// starting with "github.com/" as a git URL if the given path does not
// exist locally. The "github.com/" prefix is kept for backward compatibility,
// and is a legacy feature.
//
// Going forward, no additional prefixes should be added, and users should
// be encouraged to use explicit URLs (https://github.com/user/repo.git) instead.
//
// Note that IsGitURL does not check if "github.com/" prefixes exist as a local
// path. Code using this function should check if the path exists locally before
// using it as a URL.
//
// # Fragments
//
// Git URLs accept context configuration in their fragment section, separated by
// a colon (`:`). The first part represents the reference to check out, and can
// be either a branch, a tag, or a remote reference. The second part represents
// a subdirectory inside the repository to use as the build context.
//
// For example,the following URL uses a directory named "docker" in the branch
// "container" in the https://github.com/myorg/my-repo.git repository:
//
// https://github.com/myorg/my-repo.git#container:docker
//
// The following table represents all the valid suffixes with their build
// contexts:
//
// | Build Syntax Suffix            | Git reference used   | Build Context Used |
// |--------------------------------|----------------------|--------------------|
// | my-repo.git                    | refs/heads/master    | /                  |
// | my-repo.git#mytag              | refs/tags/my-tag     | /                  |
// | my-repo.git#mybranch           | refs/heads/my-branch | /                  |
// | my-repo.git#pull/42/head       | refs/pull/42/head    | /                  |
// | my-repo.git#:directory         | refs/heads/master    | /directory         |
// | my-repo.git#master:directory   | refs/heads/master    | /directory         |
// | my-repo.git#mytag:directory    | refs/tags/my-tag     | /directory         |
// | my-repo.git#mybranch:directory | refs/heads/my-branch | /directory         |
func IsGitURL(str string) bool {
	if IsURL(str) && urlPathWithFragmentSuffix.MatchString(str) {
		return true
	}
	return strings.HasPrefix(str, "git://") ||
		strings.HasPrefix(str, "github.com/") ||
		strings.HasPrefix(str, "git@")
}
package convert
import (
gogotypes "github.com/gogo/protobuf/types"
swarmtypes "github.com/moby/moby/api/types/swarm"
swarmapi "github.com/moby/swarmkit/v2/api"
)
// ConfigFromGRPC converts a grpc Config to a Config.
func ConfigFromGRPC(s *swarmapi.Config) swarmtypes.Config {
	config := swarmtypes.Config{
		ID: s.ID,
		Spec: swarmtypes.ConfigSpec{
			Annotations: annotationsFromGRPC(s.Spec.Annotations),
			Data:        s.Spec.Data,
		},
	}
	config.Version.Index = s.Meta.Version.Index

	// Meta timestamps; conversion errors are ignored, leaving zero values.
	config.CreatedAt, _ = gogotypes.TimestampFromProto(s.Meta.CreatedAt)
	config.UpdatedAt, _ = gogotypes.TimestampFromProto(s.Meta.UpdatedAt)

	if tmpl := s.Spec.Templating; tmpl != nil {
		config.Spec.Templating = &swarmtypes.Driver{
			Name:    tmpl.Name,
			Options: tmpl.Options,
		}
	}

	return config
}
// ConfigSpecToGRPC converts Config to a grpc Config.
func ConfigSpecToGRPC(s swarmtypes.ConfigSpec) swarmapi.ConfigSpec {
	spec := swarmapi.ConfigSpec{
		Annotations: swarmapi.Annotations{
			Name:   s.Name,
			Labels: s.Labels,
		},
		Data: s.Data,
	}

	if tmpl := s.Templating; tmpl != nil {
		spec.Templating = &swarmapi.Driver{
			Name:    tmpl.Name,
			Options: tmpl.Options,
		}
	}

	return spec
}
// ConfigReferencesFromGRPC converts a slice of grpc ConfigReference to ConfigReference
func ConfigReferencesFromGRPC(s []*swarmapi.ConfigReference) []*swarmtypes.ConfigReference {
	refs := make([]*swarmtypes.ConfigReference, 0, len(s))
	for _, r := range s {
		ref := &swarmtypes.ConfigReference{
			ConfigID:   r.ConfigID,
			ConfigName: r.ConfigName,
		}
		// Only file targets carry extra data; other target kinds are ignored.
		if t, ok := r.Target.(*swarmapi.ConfigReference_File); ok {
			ref.File = &swarmtypes.ConfigReferenceFileTarget{
				Name: t.File.Name,
				UID:  t.File.UID,
				GID:  t.File.GID,
				Mode: t.File.Mode,
			}
		}
		refs = append(refs, ref)
	}
	return refs
}
package convert
import (
"context"
"encoding/json"
"fmt"
"strings"
"github.com/containerd/log"
gogotypes "github.com/gogo/protobuf/types"
"github.com/moby/moby/api/types/container"
mounttypes "github.com/moby/moby/api/types/mount"
types "github.com/moby/moby/api/types/swarm"
swarmapi "github.com/moby/swarmkit/v2/api"
"github.com/pkg/errors"
)
// containerSpecFromGRPC converts a swarmkit ContainerSpec into the engine's
// API representation. It returns nil when c is nil. Nested messages (DNS
// config, privileges, mounts, stop grace period, healthcheck) are converted
// only when present in the source.
func containerSpecFromGRPC(c *swarmapi.ContainerSpec) *types.ContainerSpec {
	if c == nil {
		return nil
	}
	containerSpec := &types.ContainerSpec{
		Image:          c.Image,
		Labels:         c.Labels,
		Command:        c.Command,
		Args:           c.Args,
		Hostname:       c.Hostname,
		Env:            c.Env,
		Dir:            c.Dir,
		User:           c.User,
		Groups:         c.Groups,
		StopSignal:     c.StopSignal,
		TTY:            c.TTY,
		OpenStdin:      c.OpenStdin,
		ReadOnly:       c.ReadOnly,
		Hosts:          c.Hosts,
		Secrets:        secretReferencesFromGRPC(c.Secrets),
		Configs:        configReferencesFromGRPC(c.Configs),
		Isolation:      IsolationFromGRPC(c.Isolation),
		Init:           initFromGRPC(c.Init),
		Sysctls:        c.Sysctls,
		CapabilityAdd:  c.CapabilityAdd,
		CapabilityDrop: c.CapabilityDrop,
		Ulimits:        ulimitsFromGRPC(c.Ulimits),
		OomScoreAdj:    c.OomScoreAdj,
	}

	if c.DNSConfig != nil {
		containerSpec.DNSConfig = &types.DNSConfig{
			Nameservers: c.DNSConfig.Nameservers,
			Search:      c.DNSConfig.Search,
			Options:     c.DNSConfig.Options,
		}
	}

	// Privileges
	if c.Privileges != nil {
		containerSpec.Privileges = &types.Privileges{}

		if c.Privileges.CredentialSpec != nil {
			containerSpec.Privileges.CredentialSpec = credentialSpecFromGRPC(c.Privileges.CredentialSpec)
		}

		if c.Privileges.SELinuxContext != nil {
			containerSpec.Privileges.SELinuxContext = &types.SELinuxContext{
				Disable: c.Privileges.SELinuxContext.Disable,
				User:    c.Privileges.SELinuxContext.User,
				Type:    c.Privileges.SELinuxContext.Type,
				Role:    c.Privileges.SELinuxContext.Role,
				Level:   c.Privileges.SELinuxContext.Level,
			}
		}

		if c.Privileges.Seccomp != nil {
			containerSpec.Privileges.Seccomp = &types.SeccompOpts{
				Profile: c.Privileges.Seccomp.Profile,
			}

			// Map the seccomp mode enum; unmatched values leave the zero value.
			switch c.Privileges.Seccomp.Mode {
			case swarmapi.Privileges_SeccompOpts_DEFAULT:
				containerSpec.Privileges.Seccomp.Mode = types.SeccompModeDefault
			case swarmapi.Privileges_SeccompOpts_UNCONFINED:
				containerSpec.Privileges.Seccomp.Mode = types.SeccompModeUnconfined
			case swarmapi.Privileges_SeccompOpts_CUSTOM:
				containerSpec.Privileges.Seccomp.Mode = types.SeccompModeCustom
			}
		}

		if c.Privileges.Apparmor != nil {
			containerSpec.Privileges.AppArmor = &types.AppArmorOpts{}

			// Map the AppArmor mode enum; unmatched values leave the zero value.
			switch c.Privileges.Apparmor.Mode {
			case swarmapi.Privileges_AppArmorOpts_DEFAULT:
				containerSpec.Privileges.AppArmor.Mode = types.AppArmorModeDefault
			case swarmapi.Privileges_AppArmorOpts_DISABLED:
				containerSpec.Privileges.AppArmor.Mode = types.AppArmorModeDisabled
			}
		}

		containerSpec.Privileges.NoNewPrivileges = c.Privileges.NoNewPrivileges
	}

	// Mounts
	for _, m := range c.Mounts {
		// Enum values are translated through the generated proto name tables,
		// lowercased to match the engine API's string form.
		mount := mounttypes.Mount{
			Target:   m.Target,
			Source:   m.Source,
			Type:     mounttypes.Type(strings.ToLower(swarmapi.Mount_MountType_name[int32(m.Type)])),
			ReadOnly: m.ReadOnly,
		}

		if m.BindOptions != nil {
			mount.BindOptions = &mounttypes.BindOptions{
				Propagation:            mounttypes.Propagation(strings.ToLower(swarmapi.Mount_BindOptions_MountPropagation_name[int32(m.BindOptions.Propagation)])),
				NonRecursive:           m.BindOptions.NonRecursive,
				CreateMountpoint:       m.BindOptions.CreateMountpoint,
				ReadOnlyNonRecursive:   m.BindOptions.ReadOnlyNonRecursive,
				ReadOnlyForceRecursive: m.BindOptions.ReadOnlyForceRecursive,
			}
		}

		if m.VolumeOptions != nil {
			mount.VolumeOptions = &mounttypes.VolumeOptions{
				NoCopy:  m.VolumeOptions.NoCopy,
				Labels:  m.VolumeOptions.Labels,
				Subpath: m.VolumeOptions.Subpath,
			}
			if m.VolumeOptions.DriverConfig != nil {
				mount.VolumeOptions.DriverConfig = &mounttypes.Driver{
					Name:    m.VolumeOptions.DriverConfig.Name,
					Options: m.VolumeOptions.DriverConfig.Options,
				}
			}
		}

		if m.TmpfsOptions != nil {
			mount.TmpfsOptions = &mounttypes.TmpfsOptions{
				SizeBytes: m.TmpfsOptions.SizeBytes,
				Mode:      m.TmpfsOptions.Mode,
				Options:   tmpfsOptionsFromGRPC(m.TmpfsOptions.Options),
			}
		}

		containerSpec.Mounts = append(containerSpec.Mounts, mount)
	}

	if c.StopGracePeriod != nil {
		// Conversion error is ignored; grace keeps its zero value in that case.
		grace, _ := gogotypes.DurationFromProto(c.StopGracePeriod)
		containerSpec.StopGracePeriod = &grace
	}

	if c.Healthcheck != nil {
		containerSpec.Healthcheck = healthConfigFromGRPC(c.Healthcheck)
	}

	return containerSpec
}
// initFromGRPC unwraps an optional protobuf BoolValue into a *bool.
// A nil wrapper yields a nil pointer.
func initFromGRPC(v *gogotypes.BoolValue) *bool {
	if v == nil {
		return nil
	}
	b := v.GetValue()
	return &b
}
// initToGRPC wraps an optional *bool in a protobuf BoolValue.
// A nil pointer yields a nil wrapper.
func initToGRPC(v *bool) *gogotypes.BoolValue {
	if v != nil {
		return &gogotypes.BoolValue{Value: *v}
	}
	return nil
}
// secretReferencesToGRPC converts API secret references into their swarmkit
// equivalents. References without a file target produce an entry with no
// Target set.
func secretReferencesToGRPC(sr []*types.SecretReference) []*swarmapi.SecretReference {
	out := make([]*swarmapi.SecretReference, 0, len(sr))
	for _, src := range sr {
		converted := &swarmapi.SecretReference{
			SecretID:   src.SecretID,
			SecretName: src.SecretName,
		}
		if f := src.File; f != nil {
			converted.Target = &swarmapi.SecretReference_File{
				File: &swarmapi.FileTarget{
					Name: f.Name,
					UID:  f.UID,
					GID:  f.GID,
					Mode: f.Mode,
				},
			}
		}
		out = append(out, converted)
	}
	return out
}
// secretReferencesFromGRPC converts swarmkit secret references into API
// secret references. Entries whose target is not a file are skipped with a
// warning rather than failing the whole conversion.
func secretReferencesFromGRPC(sr []*swarmapi.SecretReference) []*types.SecretReference {
	refs := make([]*types.SecretReference, 0, len(sr))
	for _, s := range sr {
		f := s.GetFile()
		if f == nil {
			// Only file targets are supported here; log and move on.
			log.G(context.TODO()).Warnf("secret target not a file: secret=%s", s.SecretID)
			continue
		}
		refs = append(refs, &types.SecretReference{
			SecretID:   s.SecretID,
			SecretName: s.SecretName,
			File: &types.SecretReferenceFileTarget{
				Name: f.Name,
				UID:  f.UID,
				GID:  f.GID,
				Mode: f.Mode,
			},
		})
	}
	return refs
}
// configReferencesToGRPC converts API config references into their swarmkit
// equivalents. Each reference must carry exactly one of a File or a Runtime
// target; anything else is rejected with an error.
func configReferencesToGRPC(sr []*types.ConfigReference) ([]*swarmapi.ConfigReference, error) {
	out := make([]*swarmapi.ConfigReference, 0, len(sr))
	for _, c := range sr {
		if c.Runtime == nil && c.File == nil {
			return nil, errors.New("either File or Runtime should be set")
		}
		if c.Runtime != nil && c.File != nil {
			return nil, errors.New("cannot specify both File and Runtime")
		}
		ref := &swarmapi.ConfigReference{
			ConfigID:   c.ConfigID,
			ConfigName: c.ConfigName,
		}
		if c.Runtime != nil {
			// Runtime target was added in API v1.40 and takes precedence
			// over File; the two are mutually exclusive (checked above).
			ref.Target = &swarmapi.ConfigReference_Runtime{
				Runtime: &swarmapi.RuntimeTarget{},
			}
		} else {
			ref.Target = &swarmapi.ConfigReference_File{
				File: &swarmapi.FileTarget{
					Name: c.File.Name,
					UID:  c.File.UID,
					GID:  c.File.GID,
					Mode: c.File.Mode,
				},
			}
		}
		out = append(out, ref)
	}
	return out, nil
}
// configReferencesFromGRPC converts swarmkit config references into API
// config references, skipping (with a warning) any entry whose target type
// is not recognized.
func configReferencesFromGRPC(sr []*swarmapi.ConfigReference) []*types.ConfigReference {
	refs := make([]*types.ConfigReference, 0, len(sr))
	for _, c := range sr {
		ref := &types.ConfigReference{
			ConfigID:   c.ConfigID,
			ConfigName: c.ConfigName,
		}
		switch {
		case c.GetRuntime() != nil:
			ref.Runtime = &types.ConfigReferenceRuntimeTarget{}
		case c.GetFile() != nil:
			f := c.GetFile()
			ref.File = &types.ConfigReferenceFileTarget{
				Name: f.Name,
				UID:  f.UID,
				GID:  f.GID,
				Mode: f.Mode,
			}
		default:
			// Unknown target type: drop the entry rather than fail.
			log.G(context.TODO()).Warnf("config target not known: config=%s", c.ConfigID)
			continue
		}
		refs = append(refs, ref)
	}
	return refs
}
// containerToGRPC converts an API ContainerSpec into a swarmkit
// ContainerSpec. It returns an error for an invalid credential spec,
// config reference, mount type, or bind-mount propagation mode.
func containerToGRPC(c *types.ContainerSpec) (*swarmapi.ContainerSpec, error) {
	containerSpec := &swarmapi.ContainerSpec{
		Image:          c.Image,
		Labels:         c.Labels,
		Command:        c.Command,
		Args:           c.Args,
		Hostname:       c.Hostname,
		Env:            c.Env,
		Dir:            c.Dir,
		User:           c.User,
		Groups:         c.Groups,
		StopSignal:     c.StopSignal,
		TTY:            c.TTY,
		OpenStdin:      c.OpenStdin,
		ReadOnly:       c.ReadOnly,
		Hosts:          c.Hosts,
		Secrets:        secretReferencesToGRPC(c.Secrets),
		Isolation:      isolationToGRPC(c.Isolation),
		Init:           initToGRPC(c.Init),
		Sysctls:        c.Sysctls,
		CapabilityAdd:  c.CapabilityAdd,
		CapabilityDrop: c.CapabilityDrop,
		Ulimits:        ulimitsToGRPC(c.Ulimits),
		OomScoreAdj:    c.OomScoreAdj,
	}
	if c.DNSConfig != nil {
		containerSpec.DNSConfig = &swarmapi.ContainerSpec_DNSConfig{
			Nameservers: c.DNSConfig.Nameservers,
			Search:      c.DNSConfig.Search,
			Options:     c.DNSConfig.Options,
		}
	}
	if c.StopGracePeriod != nil {
		containerSpec.StopGracePeriod = gogotypes.DurationProto(*c.StopGracePeriod)
	}
	// Privileges
	if c.Privileges != nil {
		containerSpec.Privileges = &swarmapi.Privileges{}
		if c.Privileges.CredentialSpec != nil {
			cs, err := credentialSpecToGRPC(c.Privileges.CredentialSpec)
			if err != nil {
				return nil, errors.Wrap(err, "invalid CredentialSpec")
			}
			containerSpec.Privileges.CredentialSpec = cs
		}
		if c.Privileges.SELinuxContext != nil {
			containerSpec.Privileges.SELinuxContext = &swarmapi.Privileges_SELinuxContext{
				Disable: c.Privileges.SELinuxContext.Disable,
				User:    c.Privileges.SELinuxContext.User,
				Type:    c.Privileges.SELinuxContext.Type,
				Role:    c.Privileges.SELinuxContext.Role,
				Level:   c.Privileges.SELinuxContext.Level,
			}
		}
		if c.Privileges.Seccomp != nil {
			containerSpec.Privileges.Seccomp = &swarmapi.Privileges_SeccompOpts{
				Profile: c.Privileges.Seccomp.Profile,
			}
			// Unrecognized modes fall through and keep the zero (DEFAULT) value.
			switch c.Privileges.Seccomp.Mode {
			case types.SeccompModeDefault:
				containerSpec.Privileges.Seccomp.Mode = swarmapi.Privileges_SeccompOpts_DEFAULT
			case types.SeccompModeUnconfined:
				containerSpec.Privileges.Seccomp.Mode = swarmapi.Privileges_SeccompOpts_UNCONFINED
			case types.SeccompModeCustom:
				containerSpec.Privileges.Seccomp.Mode = swarmapi.Privileges_SeccompOpts_CUSTOM
			}
		}
		if c.Privileges.AppArmor != nil {
			containerSpec.Privileges.Apparmor = &swarmapi.Privileges_AppArmorOpts{}
			switch c.Privileges.AppArmor.Mode {
			case types.AppArmorModeDefault:
				containerSpec.Privileges.Apparmor.Mode = swarmapi.Privileges_AppArmorOpts_DEFAULT
			case types.AppArmorModeDisabled:
				containerSpec.Privileges.Apparmor.Mode = swarmapi.Privileges_AppArmorOpts_DISABLED
			}
		}
		containerSpec.Privileges.NoNewPrivileges = c.Privileges.NoNewPrivileges
	}
	if c.Configs != nil {
		configs, err := configReferencesToGRPC(c.Configs)
		if err != nil {
			return nil, errors.Wrap(err, "invalid Config")
		}
		containerSpec.Configs = configs
	}
	// Mounts
	for _, m := range c.Mounts {
		mount := swarmapi.Mount{
			Target:   m.Target,
			Source:   m.Source,
			ReadOnly: m.ReadOnly,
		}
		if mountType, ok := swarmapi.Mount_MountType_value[strings.ToUpper(string(m.Type))]; ok {
			mount.Type = swarmapi.Mount_MountType(mountType)
		} else if string(m.Type) != "" {
			return nil, fmt.Errorf("invalid MountType: %q", m.Type)
		}
		if m.BindOptions != nil {
			if mountPropagation, ok := swarmapi.Mount_BindOptions_MountPropagation_value[strings.ToUpper(string(m.BindOptions.Propagation))]; ok {
				mount.BindOptions = &swarmapi.Mount_BindOptions{Propagation: swarmapi.Mount_BindOptions_MountPropagation(mountPropagation)}
			} else if string(m.BindOptions.Propagation) != "" {
				return nil, fmt.Errorf("invalid MountPropagation: %q", m.BindOptions.Propagation)
			}
			// Copy the boolean bind flags as well; previously only
			// NonRecursive was forwarded, silently dropping
			// CreateMountpoint, ReadOnlyNonRecursive, and
			// ReadOnlyForceRecursive even though the reverse (FromGRPC)
			// conversion reads all of them.
			if m.BindOptions.NonRecursive || m.BindOptions.CreateMountpoint ||
				m.BindOptions.ReadOnlyNonRecursive || m.BindOptions.ReadOnlyForceRecursive {
				if mount.BindOptions == nil {
					// the propagation defaults to rprivate
					mount.BindOptions = &swarmapi.Mount_BindOptions{}
				}
				mount.BindOptions.NonRecursive = m.BindOptions.NonRecursive
				mount.BindOptions.CreateMountpoint = m.BindOptions.CreateMountpoint
				mount.BindOptions.ReadOnlyNonRecursive = m.BindOptions.ReadOnlyNonRecursive
				mount.BindOptions.ReadOnlyForceRecursive = m.BindOptions.ReadOnlyForceRecursive
			}
		}
		if m.VolumeOptions != nil {
			mount.VolumeOptions = &swarmapi.Mount_VolumeOptions{
				NoCopy:  m.VolumeOptions.NoCopy,
				Labels:  m.VolumeOptions.Labels,
				Subpath: m.VolumeOptions.Subpath,
			}
			if m.VolumeOptions.DriverConfig != nil {
				mount.VolumeOptions.DriverConfig = &swarmapi.Driver{
					Name:    m.VolumeOptions.DriverConfig.Name,
					Options: m.VolumeOptions.DriverConfig.Options,
				}
			}
		}
		if m.TmpfsOptions != nil {
			mount.TmpfsOptions = &swarmapi.Mount_TmpfsOptions{
				SizeBytes: m.TmpfsOptions.SizeBytes,
				Mode:      m.TmpfsOptions.Mode,
				Options:   tmpfsOptionsToGRPC(m.TmpfsOptions.Options),
			}
		}
		containerSpec.Mounts = append(containerSpec.Mounts, mount)
	}
	if c.Healthcheck != nil {
		containerSpec.Healthcheck = healthConfigToGRPC(c.Healthcheck)
	}
	return containerSpec, nil
}
// credentialSpecFromGRPC converts a swarmkit credential spec into the API
// type, populating the field matching whichever source variant is set.
func credentialSpecFromGRPC(c *swarmapi.Privileges_CredentialSpec) *types.CredentialSpec {
	converted := &types.CredentialSpec{}
	switch src := c.Source.(type) {
	case *swarmapi.Privileges_CredentialSpec_Config:
		converted.Config = src.Config
	case *swarmapi.Privileges_CredentialSpec_File:
		converted.File = src.File
	case *swarmapi.Privileges_CredentialSpec_Registry:
		converted.Registry = src.Registry
	}
	return converted
}
// credentialSpecToGRPC converts an API CredentialSpec to the swarmkit
// representation. Exactly one of Config, File, or Registry must be set;
// zero or multiple sources produce an error.
func credentialSpecToGRPC(c *types.CredentialSpec) (*swarmapi.Privileges_CredentialSpec, error) {
	var provided []string
	if c.Config != "" {
		provided = append(provided, `"config"`)
	}
	if c.File != "" {
		provided = append(provided, `"file"`)
	}
	if c.Registry != "" {
		provided = append(provided, `"registry"`)
	}
	switch n := len(provided); {
	case n == 0:
		return nil, errors.New(`must either provide "file", "registry", or "config" for credential spec`)
	case n == 2:
		return nil, fmt.Errorf("cannot specify both %s and %s credential specs", provided[0], provided[1])
	case n > 2:
		return nil, fmt.Errorf("cannot specify both %s, and %s credential specs", strings.Join(provided[:n-1], ", "), provided[n-1])
	}
	// Exactly one source remains at this point.
	spec := &swarmapi.Privileges_CredentialSpec{}
	switch {
	case c.Config != "":
		spec.Source = &swarmapi.Privileges_CredentialSpec_Config{Config: c.Config}
	case c.File != "":
		spec.Source = &swarmapi.Privileges_CredentialSpec_File{File: c.File}
	case c.Registry != "":
		spec.Source = &swarmapi.Privileges_CredentialSpec_Registry{Registry: c.Registry}
	}
	return spec, nil
}
// healthConfigFromGRPC converts a swarmkit HealthConfig into the API type.
// Duration conversion errors are deliberately ignored; a failed conversion
// leaves the corresponding field at its zero value.
func healthConfigFromGRPC(h *swarmapi.HealthConfig) *container.HealthConfig {
	converted := &container.HealthConfig{
		Test:    h.Test,
		Retries: int(h.Retries),
	}
	converted.Interval, _ = gogotypes.DurationFromProto(h.Interval)
	converted.Timeout, _ = gogotypes.DurationFromProto(h.Timeout)
	converted.StartPeriod, _ = gogotypes.DurationFromProto(h.StartPeriod)
	converted.StartInterval, _ = gogotypes.DurationFromProto(h.StartInterval)
	return converted
}
// healthConfigToGRPC converts an API HealthConfig into the swarmkit type,
// encoding each duration as a protobuf Duration.
func healthConfigToGRPC(h *container.HealthConfig) *swarmapi.HealthConfig {
	converted := &swarmapi.HealthConfig{
		Test:    h.Test,
		Retries: int32(h.Retries),
	}
	converted.Interval = gogotypes.DurationProto(h.Interval)
	converted.Timeout = gogotypes.DurationProto(h.Timeout)
	converted.StartPeriod = gogotypes.DurationProto(h.StartPeriod)
	converted.StartInterval = gogotypes.DurationProto(h.StartInterval)
	return converted
}
// IsolationFromGRPC converts a swarm api container isolation to a moby
// isolation representation. Unrecognized values map to IsolationEmpty.
func IsolationFromGRPC(i swarmapi.ContainerSpec_Isolation) container.Isolation {
	if i == swarmapi.ContainerIsolationHyperV {
		return container.IsolationHyperV
	}
	if i == swarmapi.ContainerIsolationProcess {
		return container.IsolationProcess
	}
	if i == swarmapi.ContainerIsolationDefault {
		return container.IsolationDefault
	}
	return container.IsolationEmpty
}
// isolationToGRPC converts a moby isolation mode to the swarm api value.
// Anything that is neither Hyper-V nor process maps to the default.
func isolationToGRPC(i container.Isolation) swarmapi.ContainerSpec_Isolation {
	switch {
	case i.IsHyperV():
		return swarmapi.ContainerIsolationHyperV
	case i.IsProcess():
		return swarmapi.ContainerIsolationProcess
	default:
		return swarmapi.ContainerIsolationDefault
	}
}
// ulimitsFromGRPC converts swarmkit ulimit entries into API ulimits.
func ulimitsFromGRPC(u []*swarmapi.ContainerSpec_Ulimit) []*container.Ulimit {
	out := make([]*container.Ulimit, 0, len(u))
	for _, l := range u {
		out = append(out, &container.Ulimit{
			Name: l.Name,
			Soft: l.Soft,
			Hard: l.Hard,
		})
	}
	return out
}
// ulimitsToGRPC converts API ulimits into swarmkit ulimit entries.
func ulimitsToGRPC(u []*container.Ulimit) []*swarmapi.ContainerSpec_Ulimit {
	out := make([]*swarmapi.ContainerSpec_Ulimit, 0, len(u))
	for _, l := range u {
		out = append(out, &swarmapi.ContainerSpec_Ulimit{
			Name: l.Name,
			Soft: l.Soft,
			Hard: l.Hard,
		})
	}
	return out
}
// tmpfsOptionsToGRPC flattens the structured [][]string tmpfs options of
// the Docker API into the single string field exposed by the swarmkit API,
// by JSON-encoding the array-of-arrays.
func tmpfsOptionsToGRPC(options [][]string) string {
	// Marshalling a [][]string cannot fail (no unsupported types, no
	// cycles), so the error is deliberately discarded.
	encoded, _ := json.Marshal(options) //nolint:errchkjson // ignoring errors, as described above
	return string(encoded)
}
// tmpfsOptionsFromGRPC is the inverse of tmpfsOptionsToGRPC: it decodes the
// JSON string carried by the swarmkit API back into the structured
// array-of-arrays form used by the Docker API.
func tmpfsOptionsFromGRPC(options string) [][]string {
	var decoded [][]string
	// The input is normally JSON we produced ourselves (tmpfsOptionsToGRPC),
	// so decode errors are ignored; malformed input simply yields a nil
	// slice, which is not catastrophic. Anyone mucking with the gRPC API
	// directly gets what they asked for.
	_ = json.Unmarshal([]byte(options), &decoded)
	return decoded
}
package convert
import (
"strings"
"github.com/docker/docker/daemon/libnetwork/scope"
gogotypes "github.com/gogo/protobuf/types"
"github.com/moby/moby/api/types/network"
types "github.com/moby/moby/api/types/swarm"
swarmapi "github.com/moby/swarmkit/v2/api"
)
// networkAttachmentFromGRPC converts a swarmkit NetworkAttachment into the
// API type; a nil attachment yields the zero value.
func networkAttachmentFromGRPC(na *swarmapi.NetworkAttachment) types.NetworkAttachment {
	if na == nil {
		return types.NetworkAttachment{}
	}
	return types.NetworkAttachment{
		Network:   networkFromGRPC(na.Network),
		Addresses: na.Addresses,
	}
}
// networkFromGRPC converts a swarmkit Network into the API Network type.
// A nil input yields the zero value.
func networkFromGRPC(n *swarmapi.Network) types.Network {
	if n == nil {
		return types.Network{}
	}
	nw := types.Network{
		ID: n.ID,
		Spec: types.NetworkSpec{
			IPv6Enabled: n.Spec.Ipv6Enabled,
			Internal:    n.Spec.Internal,
			Attachable:  n.Spec.Attachable,
			Ingress:     IsIngressNetwork(n),
			IPAMOptions: ipamFromGRPC(n.Spec.IPAM),
			Scope:       scope.Swarm,
		},
		IPAMOptions: ipamFromGRPC(n.IPAM),
	}
	if from := n.Spec.GetNetwork(); from != "" {
		nw.Spec.ConfigFrom = &network.ConfigReference{Network: from}
	}
	// Meta: version index and timestamps (conversion errors ignored).
	nw.Version.Index = n.Meta.Version.Index
	nw.CreatedAt, _ = gogotypes.TimestampFromProto(n.Meta.CreatedAt)
	nw.UpdatedAt, _ = gogotypes.TimestampFromProto(n.Meta.UpdatedAt)
	// Annotations
	nw.Spec.Annotations = annotationsFromGRPC(n.Spec.Annotations)
	// DriverConfiguration
	if cfg := n.Spec.DriverConfig; cfg != nil {
		nw.Spec.DriverConfiguration = &types.Driver{
			Name:    cfg.Name,
			Options: cfg.Options,
		}
	}
	// DriverState
	if state := n.DriverState; state != nil {
		nw.DriverState = types.Driver{
			Name:    state.Name,
			Options: state.Options,
		}
	}
	return nw
}
// ipamFromGRPC converts swarmkit IPAM options into the API type; a nil
// input yields nil.
func ipamFromGRPC(i *swarmapi.IPAMOptions) *types.IPAMOptions {
	if i == nil {
		return nil
	}
	ipam := &types.IPAMOptions{}
	if d := i.Driver; d != nil {
		ipam.Driver.Name = d.Name
		ipam.Driver.Options = d.Options
	}
	for _, cfg := range i.Configs {
		ipam.Configs = append(ipam.Configs, types.IPAMConfig{
			Subnet:  cfg.Subnet,
			Range:   cfg.Range,
			Gateway: cfg.Gateway,
		})
	}
	return ipam
}
// endpointSpecFromGRPC converts a swarmkit EndpointSpec into the API type;
// a nil input yields nil.
func endpointSpecFromGRPC(es *swarmapi.EndpointSpec) *types.EndpointSpec {
	if es == nil {
		return nil
	}
	converted := &types.EndpointSpec{
		Mode: types.ResolutionMode(strings.ToLower(es.Mode.String())),
	}
	for _, p := range es.Ports {
		converted.Ports = append(converted.Ports, swarmPortConfigToAPIPortConfig(p))
	}
	return converted
}
// endpointFromGRPC converts a swarmkit Endpoint into the API type; a nil
// input yields the zero value.
func endpointFromGRPC(e *swarmapi.Endpoint) types.Endpoint {
	var endpoint types.Endpoint
	if e == nil {
		return endpoint
	}
	if spec := endpointSpecFromGRPC(e.Spec); spec != nil {
		endpoint.Spec = *spec
	}
	for _, p := range e.Ports {
		endpoint.Ports = append(endpoint.Ports, swarmPortConfigToAPIPortConfig(p))
	}
	for _, vip := range e.VirtualIPs {
		endpoint.VirtualIPs = append(endpoint.VirtualIPs, types.EndpointVirtualIP{
			NetworkID: vip.NetworkID,
			Addr:      vip.Addr,
		})
	}
	return endpoint
}
// swarmPortConfigToAPIPortConfig converts a swarmkit PortConfig into the
// API type, lower-casing the protocol and publish-mode enum names.
func swarmPortConfigToAPIPortConfig(portConfig *swarmapi.PortConfig) types.PortConfig {
	protocol := strings.ToLower(swarmapi.PortConfig_Protocol_name[int32(portConfig.Protocol)])
	publishMode := strings.ToLower(swarmapi.PortConfig_PublishMode_name[int32(portConfig.PublishMode)])
	return types.PortConfig{
		Name:          portConfig.Name,
		Protocol:      types.PortConfigProtocol(protocol),
		PublishMode:   types.PortConfigPublishMode(publishMode),
		TargetPort:    portConfig.TargetPort,
		PublishedPort: portConfig.PublishedPort,
	}
}
// BasicNetworkFromGRPC converts a grpc Network to a NetworkResource.
func BasicNetworkFromGRPC(n swarmapi.Network) network.Inspect {
	var ipam network.IPAM
	if n.IPAM != nil {
		if d := n.IPAM.Driver; d != nil {
			ipam.Driver = d.Name
			ipam.Options = d.Options
		}
		ipam.Config = make([]network.IPAMConfig, 0, len(n.IPAM.Configs))
		for _, cfg := range n.IPAM.Configs {
			ipam.Config = append(ipam.Config, network.IPAMConfig{
				Subnet:     cfg.Subnet,
				IPRange:    cfg.Range,
				Gateway:    cfg.Gateway,
				AuxAddress: cfg.Reserved,
			})
		}
	}
	result := network.Inspect{
		ID:         n.ID,
		Name:       n.Spec.Annotations.Name,
		Scope:      scope.Swarm,
		EnableIPv4: true,
		EnableIPv6: n.Spec.Ipv6Enabled,
		IPAM:       ipam,
		Internal:   n.Spec.Internal,
		Attachable: n.Spec.Attachable,
		Ingress:    IsIngressNetwork(&n),
		Labels:     n.Spec.Annotations.Labels,
	}
	// Creation timestamp; conversion errors are ignored.
	result.Created, _ = gogotypes.TimestampFromProto(n.Meta.CreatedAt)
	if from := n.Spec.GetNetwork(); from != "" {
		result.ConfigFrom = network.ConfigReference{Network: from}
	}
	if state := n.DriverState; state != nil {
		result.Driver = state.Name
		result.Options = state.Options
	}
	return result
}
// BasicNetworkCreateToGRPC converts a NetworkCreateRequest to a grpc NetworkSpec.
func BasicNetworkCreateToGRPC(create network.CreateRequest) swarmapi.NetworkSpec {
	spec := swarmapi.NetworkSpec{
		Annotations: swarmapi.Annotations{
			Name:   create.Name,
			Labels: create.Labels,
		},
		DriverConfig: &swarmapi.Driver{
			Name:    create.Driver,
			Options: create.Options,
		},
		Internal:   create.Internal,
		Attachable: create.Attachable,
		Ingress:    create.Ingress,
	}
	if create.EnableIPv6 != nil {
		spec.Ipv6Enabled = *create.EnableIPv6
	}
	if create.IPAM != nil {
		driverName := create.IPAM.Driver
		if driverName == "" {
			// An unset IPAM driver means the default driver.
			driverName = "default"
		}
		configs := make([]*swarmapi.IPAMConfig, 0, len(create.IPAM.Config))
		for _, cfg := range create.IPAM.Config {
			configs = append(configs, &swarmapi.IPAMConfig{
				Subnet:  cfg.Subnet,
				Range:   cfg.IPRange,
				Gateway: cfg.Gateway,
			})
		}
		spec.IPAM = &swarmapi.IPAMOptions{
			Driver: &swarmapi.Driver{
				Name:    driverName,
				Options: create.IPAM.Options,
			},
			Configs: configs,
		}
	}
	if create.ConfigFrom != nil {
		spec.ConfigFrom = &swarmapi.NetworkSpec_Network{
			Network: create.ConfigFrom.Network,
		}
	}
	return spec
}
// IsIngressNetwork reports whether the swarm network is an ingress network,
// either via the explicit Ingress flag or the legacy convention of a
// network named "ingress" carrying the internal swarm label.
func IsIngressNetwork(n *swarmapi.Network) bool {
	if n.Spec.Ingress {
		return true
	}
	// Legacy detection path.
	if n.Spec.Annotations.Name != "ingress" {
		return false
	}
	_, hasInternalLabel := n.Spec.Annotations.Labels["com.docker.swarm.internal"]
	return hasInternalLabel
}
package convert
import (
"fmt"
"strings"
gogotypes "github.com/gogo/protobuf/types"
types "github.com/moby/moby/api/types/swarm"
swarmapi "github.com/moby/swarmkit/v2/api"
)
// NodeFromGRPC converts a grpc Node to a Node.
func NodeFromGRPC(n swarmapi.Node) types.Node {
	// Enum values (role, availability, state) are exposed to the API as
	// their lower-cased protobuf names.
	node := types.Node{
		ID: n.ID,
		Spec: types.NodeSpec{
			Role:         types.NodeRole(strings.ToLower(n.Spec.DesiredRole.String())),
			Availability: types.NodeAvailability(strings.ToLower(n.Spec.Availability.String())),
		},
		Status: types.NodeStatus{
			State:   types.NodeState(strings.ToLower(n.Status.State.String())),
			Message: n.Status.Message,
			Addr:    n.Status.Addr,
		},
	}
	// Meta: object version and timestamps. Timestamp conversion errors are
	// deliberately ignored; a failed conversion leaves the zero time.
	node.Version.Index = n.Meta.Version.Index
	node.CreatedAt, _ = gogotypes.TimestampFromProto(n.Meta.CreatedAt)
	node.UpdatedAt, _ = gogotypes.TimestampFromProto(n.Meta.UpdatedAt)
	// Annotations
	node.Spec.Annotations = annotationsFromGRPC(n.Spec.Annotations)
	// Description: every sub-message is optional in the gRPC type, so each
	// is nil-checked before its fields are copied.
	if n.Description != nil {
		node.Description.Hostname = n.Description.Hostname
		if n.Description.Platform != nil {
			node.Description.Platform.Architecture = n.Description.Platform.Architecture
			node.Description.Platform.OS = n.Description.Platform.OS
		}
		if n.Description.Resources != nil {
			node.Description.Resources.NanoCPUs = n.Description.Resources.NanoCPUs
			node.Description.Resources.MemoryBytes = n.Description.Resources.MemoryBytes
			node.Description.Resources.GenericResources = GenericResourcesFromGRPC(n.Description.Resources.Generic)
		}
		if n.Description.Engine != nil {
			node.Description.Engine.EngineVersion = n.Description.Engine.EngineVersion
			node.Description.Engine.Labels = n.Description.Engine.Labels
			for _, plugin := range n.Description.Engine.Plugins {
				node.Description.Engine.Plugins = append(node.Description.Engine.Plugins, types.PluginDescription{Type: plugin.Type, Name: plugin.Name})
			}
		}
		if n.Description.TLSInfo != nil {
			node.Description.TLSInfo.TrustRoot = string(n.Description.TLSInfo.TrustRoot)
			node.Description.TLSInfo.CertIssuerPublicKey = n.Description.TLSInfo.CertIssuerPublicKey
			node.Description.TLSInfo.CertIssuerSubject = n.Description.TLSInfo.CertIssuerSubject
		}
		// CSI plugin info: nil entries are skipped; topology is optional.
		for _, csi := range n.Description.CSIInfo {
			if csi != nil {
				convertedInfo := types.NodeCSIInfo{
					PluginName:        csi.PluginName,
					NodeID:            csi.NodeID,
					MaxVolumesPerNode: csi.MaxVolumesPerNode,
				}
				if csi.AccessibleTopology != nil {
					convertedInfo.AccessibleTopology = &types.Topology{
						Segments: csi.AccessibleTopology.Segments,
					}
				}
				node.Description.CSIInfo = append(
					node.Description.CSIInfo, convertedInfo,
				)
			}
		}
	}
	// Manager status is only present for manager nodes.
	if n.ManagerStatus != nil {
		node.ManagerStatus = &types.ManagerStatus{
			Leader:       n.ManagerStatus.Leader,
			Reachability: types.Reachability(strings.ToLower(n.ManagerStatus.Reachability.String())),
			Addr:         n.ManagerStatus.Addr,
		}
	}
	return node
}
// NodeSpecToGRPC converts a NodeSpec to a grpc NodeSpec. It returns an
// error when the role or availability does not name a known enum value.
func NodeSpecToGRPC(s types.NodeSpec) (swarmapi.NodeSpec, error) {
	role, ok := swarmapi.NodeRole_value[strings.ToUpper(string(s.Role))]
	if !ok {
		return swarmapi.NodeSpec{}, fmt.Errorf("invalid Role: %q", s.Role)
	}
	availability, ok := swarmapi.NodeSpec_Availability_value[strings.ToUpper(string(s.Availability))]
	if !ok {
		return swarmapi.NodeSpec{}, fmt.Errorf("invalid Availability: %q", s.Availability)
	}
	return swarmapi.NodeSpec{
		Annotations: swarmapi.Annotations{
			Name:   s.Name,
			Labels: s.Labels,
		},
		DesiredRole:  swarmapi.NodeRole(role),
		Availability: swarmapi.NodeSpec_Availability(availability),
	}, nil
}
package convert
import (
"github.com/docker/docker/pkg/plugingetter"
"github.com/moby/swarmkit/v2/node/plugin"
)
// SwarmPluginGetter adapts a plugingetter.PluginGetter to a Swarmkit plugin.Getter.
func SwarmPluginGetter(pg plugingetter.PluginGetter) plugin.Getter {
	return pluginGetter{pg: pg}
}
// pluginGetter adapts a moby plugingetter.PluginGetter to swarmkit's
// plugin.Getter interface.
type pluginGetter struct {
	pg plugingetter.PluginGetter
}

// Compile-time assertion that pluginGetter satisfies plugin.Getter.
var _ plugin.Getter = (*pluginGetter)(nil)
// swarmPlugin wraps a CompatPlugin so that its Client method satisfies
// swarmkit's plugin.Plugin interface.
type swarmPlugin struct {
	plugingetter.CompatPlugin
}

// Client returns the plugin's client by delegating to the embedded
// CompatPlugin.
func (p swarmPlugin) Client() plugin.Client {
	return p.CompatPlugin.Client()
}
// addrPlugin wraps a CompatPlugin that additionally exposes its address
// (plugingetter.PluginAddr), satisfying swarmkit's plugin.AddrPlugin.
type addrPlugin struct {
	plugingetter.CompatPlugin
	plugingetter.PluginAddr
}

// Compile-time assertion that addrPlugin satisfies plugin.AddrPlugin.
var _ plugin.AddrPlugin = (*addrPlugin)(nil)

// Client returns the plugin's client by delegating to the embedded
// CompatPlugin.
func (p addrPlugin) Client() plugin.Client {
	return p.CompatPlugin.Client()
}
// adaptPluginForSwarm wraps a moby CompatPlugin in the adapter type that
// satisfies swarmkit's plugin interfaces, choosing the address-aware
// adapter when the plugin also implements PluginAddr.
func adaptPluginForSwarm(p plugingetter.CompatPlugin) plugin.Plugin {
	pa, ok := p.(plugingetter.PluginAddr)
	if !ok {
		return swarmPlugin{p}
	}
	return addrPlugin{p, pa}
}
// Get looks up a plugin by name and capability and adapts it for swarmkit.
func (g pluginGetter) Get(name string, capability string) (plugin.Plugin, error) {
	compat, err := g.pg.Get(name, capability, plugingetter.Lookup)
	if err != nil {
		return nil, err
	}
	return adaptPluginForSwarm(compat), nil
}
// GetAllManagedPluginsByCap returns every managed plugin with the given
// capability, each adapted for swarmkit.
func (g pluginGetter) GetAllManagedPluginsByCap(capability string) []plugin.Plugin {
	managed := g.pg.GetAllManagedPluginsByCap(capability)
	adapted := make([]plugin.Plugin, 0, len(managed))
	for _, p := range managed {
		adapted = append(adapted, adaptPluginForSwarm(p))
	}
	return adapted
}
package convert
import (
gogotypes "github.com/gogo/protobuf/types"
swarmtypes "github.com/moby/moby/api/types/swarm"
swarmapi "github.com/moby/swarmkit/v2/api"
)
// SecretFromGRPC converts a grpc Secret to a Secret.
func SecretFromGRPC(s *swarmapi.Secret) swarmtypes.Secret {
	converted := swarmtypes.Secret{
		ID: s.ID,
		Spec: swarmtypes.SecretSpec{
			Annotations: annotationsFromGRPC(s.Spec.Annotations),
			Data:        s.Spec.Data,
			Driver:      driverFromGRPC(s.Spec.Driver),
		},
	}
	// Meta: version index and timestamps (conversion errors ignored).
	converted.Version.Index = s.Meta.Version.Index
	converted.CreatedAt, _ = gogotypes.TimestampFromProto(s.Meta.CreatedAt)
	converted.UpdatedAt, _ = gogotypes.TimestampFromProto(s.Meta.UpdatedAt)
	if tmpl := s.Spec.Templating; tmpl != nil {
		converted.Spec.Templating = &swarmtypes.Driver{
			Name:    tmpl.Name,
			Options: tmpl.Options,
		}
	}
	return converted
}
// SecretSpecToGRPC converts Secret to a grpc Secret.
func SecretSpecToGRPC(s swarmtypes.SecretSpec) swarmapi.SecretSpec {
	converted := swarmapi.SecretSpec{
		Annotations: swarmapi.Annotations{
			Name:   s.Name,
			Labels: s.Labels,
		},
		Data:   s.Data,
		Driver: driverToGRPC(s.Driver),
	}
	if tmpl := s.Templating; tmpl != nil {
		converted.Templating = &swarmapi.Driver{
			Name:    tmpl.Name,
			Options: tmpl.Options,
		}
	}
	return converted
}
// SecretReferencesFromGRPC converts a slice of grpc SecretReference to
// SecretReference. The result is always non-nil, even for empty input.
func SecretReferencesFromGRPC(s []*swarmapi.SecretReference) []*swarmtypes.SecretReference {
	refs := make([]*swarmtypes.SecretReference, 0, len(s))
	for _, r := range s {
		converted := &swarmtypes.SecretReference{
			SecretID:   r.SecretID,
			SecretName: r.SecretName,
		}
		if t, ok := r.Target.(*swarmapi.SecretReference_File); ok {
			converted.File = &swarmtypes.SecretReferenceFileTarget{
				Name: t.File.Name,
				UID:  t.File.UID,
				GID:  t.File.GID,
				Mode: t.File.Mode,
			}
		}
		refs = append(refs, converted)
	}
	return refs
}
package convert
import (
"fmt"
"strings"
"github.com/docker/docker/pkg/namesgenerator"
"github.com/gogo/protobuf/proto"
gogotypes "github.com/gogo/protobuf/types"
types "github.com/moby/moby/api/types/swarm"
"github.com/moby/moby/api/types/swarm/runtime"
swarmapi "github.com/moby/swarmkit/v2/api"
"github.com/moby/swarmkit/v2/api/genericresource"
"github.com/pkg/errors"
)
var (
	// ErrUnsupportedRuntime is returned when a service specifies a runtime
	// that is not supported by the daemon.
	ErrUnsupportedRuntime = errors.New("unsupported runtime")
	// ErrMismatchedRuntime is returned when a task template's Runtime does
	// not match the *Spec field that was populated.
	ErrMismatchedRuntime = errors.New("mismatched Runtime and *Spec fields")
)
// ServiceFromGRPC converts a grpc Service to a Service. It returns an
// error when either the current or the previous spec cannot be converted.
func ServiceFromGRPC(s swarmapi.Service) (types.Service, error) {
	curSpec, err := serviceSpecFromGRPC(&s.Spec)
	if err != nil {
		return types.Service{}, err
	}
	// PreviousSpec may be nil, in which case prevSpec is nil as well.
	prevSpec, err := serviceSpecFromGRPC(s.PreviousSpec)
	if err != nil {
		return types.Service{}, err
	}
	service := types.Service{
		ID:           s.ID,
		Spec:         *curSpec,
		PreviousSpec: prevSpec,
		Endpoint:     endpointFromGRPC(s.Endpoint),
	}
	// Meta: version index and timestamps (conversion errors ignored).
	service.Version.Index = s.Meta.Version.Index
	service.CreatedAt, _ = gogotypes.TimestampFromProto(s.Meta.CreatedAt)
	service.UpdatedAt, _ = gogotypes.TimestampFromProto(s.Meta.UpdatedAt)
	if s.JobStatus != nil {
		service.JobStatus = &types.JobStatus{
			JobIteration: types.Version{
				Index: s.JobStatus.JobIteration.Index,
			},
		}
		service.JobStatus.LastExecution, _ = gogotypes.TimestampFromProto(s.JobStatus.LastExecution)
	}
	// UpdateStatus
	if s.UpdateStatus != nil {
		service.UpdateStatus = &types.UpdateStatus{}
		switch s.UpdateStatus.State {
		case swarmapi.UpdateStatus_UPDATING:
			service.UpdateStatus.State = types.UpdateStateUpdating
		case swarmapi.UpdateStatus_PAUSED:
			service.UpdateStatus.State = types.UpdateStatePaused
		case swarmapi.UpdateStatus_COMPLETED:
			service.UpdateStatus.State = types.UpdateStateCompleted
		case swarmapi.UpdateStatus_ROLLBACK_STARTED:
			service.UpdateStatus.State = types.UpdateStateRollbackStarted
		case swarmapi.UpdateStatus_ROLLBACK_PAUSED:
			service.UpdateStatus.State = types.UpdateStateRollbackPaused
		case swarmapi.UpdateStatus_ROLLBACK_COMPLETED:
			service.UpdateStatus.State = types.UpdateStateRollbackCompleted
		default:
			// TODO(thaJeztah): make switch exhaustive; add api.UpdateStatus_UNKNOWN
		}
		// Zero/epoch timestamps are treated as "not set" and left nil.
		startedAt, _ := gogotypes.TimestampFromProto(s.UpdateStatus.StartedAt)
		if !startedAt.IsZero() && startedAt.Unix() != 0 {
			service.UpdateStatus.StartedAt = &startedAt
		}
		completedAt, _ := gogotypes.TimestampFromProto(s.UpdateStatus.CompletedAt)
		if !completedAt.IsZero() && completedAt.Unix() != 0 {
			service.UpdateStatus.CompletedAt = &completedAt
		}
		service.UpdateStatus.Message = s.UpdateStatus.Message
	}
	return service, nil
}
// serviceSpecFromGRPC converts a grpc ServiceSpec to a ServiceSpec. A nil
// input yields (nil, nil); an unknown or unsupported task runtime yields
// an error.
func serviceSpecFromGRPC(spec *swarmapi.ServiceSpec) (*types.ServiceSpec, error) {
	if spec == nil {
		return nil, nil
	}
	serviceNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Networks))
	for _, n := range spec.Networks {
		netConfig := types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases, DriverOpts: n.DriverAttachmentOpts}
		serviceNetworks = append(serviceNetworks, netConfig)
	}
	taskTemplate, err := taskSpecFromGRPC(spec.Task)
	if err != nil {
		return nil, err
	}
	// The task runtime determines which part of the template is populated.
	switch t := spec.Task.GetRuntime().(type) {
	case *swarmapi.TaskSpec_Container:
		containerConfig := t.Container
		taskTemplate.ContainerSpec = containerSpecFromGRPC(containerConfig)
		taskTemplate.Runtime = types.RuntimeContainer
	case *swarmapi.TaskSpec_Generic:
		// Generic runtimes: only the plugin runtime is recognized here.
		switch t.Generic.Kind {
		case string(types.RuntimePlugin):
			taskTemplate.Runtime = types.RuntimePlugin
		default:
			return nil, fmt.Errorf("unknown task runtime type: %s", t.Generic.Payload.TypeUrl)
		}
	default:
		return nil, fmt.Errorf("error creating service; unsupported runtime %T", t)
	}
	convertedSpec := &types.ServiceSpec{
		Annotations:  annotationsFromGRPC(spec.Annotations),
		TaskTemplate: taskTemplate,
		Networks:     serviceNetworks,
		EndpointSpec: endpointSpecFromGRPC(spec.Endpoint),
	}
	// UpdateConfig
	convertedSpec.UpdateConfig = updateConfigFromGRPC(spec.Update)
	convertedSpec.RollbackConfig = updateConfigFromGRPC(spec.Rollback)
	// Mode: at most one of the four service modes is set on the spec.
	switch t := spec.GetMode().(type) {
	case *swarmapi.ServiceSpec_Global:
		convertedSpec.Mode.Global = &types.GlobalService{}
	case *swarmapi.ServiceSpec_Replicated:
		convertedSpec.Mode.Replicated = &types.ReplicatedService{
			Replicas: &t.Replicated.Replicas,
		}
	case *swarmapi.ServiceSpec_ReplicatedJob:
		convertedSpec.Mode.ReplicatedJob = &types.ReplicatedJob{
			MaxConcurrent:    &t.ReplicatedJob.MaxConcurrent,
			TotalCompletions: &t.ReplicatedJob.TotalCompletions,
		}
	case *swarmapi.ServiceSpec_GlobalJob:
		convertedSpec.Mode.GlobalJob = &types.GlobalJob{}
	}
	return convertedSpec, nil
}
// ServiceSpecToGRPC converts a ServiceSpec to a grpc ServiceSpec.
func ServiceSpecToGRPC(s types.ServiceSpec) (swarmapi.ServiceSpec, error) {
name := s.Name
if name == "" {
name = namesgenerator.GetRandomName(0)
}
serviceNetworks := make([]*swarmapi.NetworkAttachmentConfig, 0, len(s.Networks)) //nolint:staticcheck // ignore SA1019: field is deprecated.
for _, n := range s.Networks { //nolint:staticcheck // ignore SA1019: field is deprecated.
netConfig := &swarmapi.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases, DriverAttachmentOpts: n.DriverOpts}
serviceNetworks = append(serviceNetworks, netConfig)
}
taskNetworks := make([]*swarmapi.NetworkAttachmentConfig, 0, len(s.TaskTemplate.Networks))
for _, n := range s.TaskTemplate.Networks {
netConfig := &swarmapi.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases, DriverAttachmentOpts: n.DriverOpts}
taskNetworks = append(taskNetworks, netConfig)
}
spec := swarmapi.ServiceSpec{
Annotations: swarmapi.Annotations{
Name: name,
Labels: s.Labels,
},
Task: swarmapi.TaskSpec{
Resources: resourcesToGRPC(s.TaskTemplate.Resources),
LogDriver: driverToGRPC(s.TaskTemplate.LogDriver),
Networks: taskNetworks,
ForceUpdate: s.TaskTemplate.ForceUpdate,
},
Networks: serviceNetworks,
}
switch s.TaskTemplate.Runtime {
case types.RuntimeContainer, "": // if empty runtime default to container
if s.TaskTemplate.ContainerSpec != nil {
containerSpec, err := containerToGRPC(s.TaskTemplate.ContainerSpec)
if err != nil {
return swarmapi.ServiceSpec{}, err
}
if s.TaskTemplate.Resources != nil && s.TaskTemplate.Resources.Limits != nil {
// TODO remove this (or keep for backward compat) once SwarmKit API moved PidsLimit into Resources
containerSpec.PidsLimit = s.TaskTemplate.Resources.Limits.Pids
}
spec.Task.Runtime = &swarmapi.TaskSpec_Container{Container: containerSpec}
} else {
// If the ContainerSpec is nil, we can't set the task runtime
return swarmapi.ServiceSpec{}, ErrMismatchedRuntime
}
case types.RuntimePlugin:
if s.TaskTemplate.PluginSpec != nil {
if s.Mode.Replicated != nil {
return swarmapi.ServiceSpec{}, errors.New("plugins must not use replicated mode")
}
s.Mode.Global = &types.GlobalService{} // must always be global
pluginSpec, err := proto.Marshal(s.TaskTemplate.PluginSpec)
if err != nil {
return swarmapi.ServiceSpec{}, err
}
spec.Task.Runtime = &swarmapi.TaskSpec_Generic{
Generic: &swarmapi.GenericRuntimeSpec{
Kind: string(types.RuntimePlugin),
Payload: &gogotypes.Any{
TypeUrl: string(types.RuntimeURLPlugin),
Value: pluginSpec,
},
},
}
} else {
return swarmapi.ServiceSpec{}, ErrMismatchedRuntime
}
case types.RuntimeNetworkAttachment:
// NOTE(dperny) I'm leaving this case here for completeness. The actual
// code is left out deliberately, as we should refuse to parse a
// Network Attachment runtime; it will cause weird behavior all over
// the system if we do. Instead, fallthrough and return
// ErrUnsupportedRuntime if we get one.
fallthrough
default:
return swarmapi.ServiceSpec{}, ErrUnsupportedRuntime
}
restartPolicy, err := restartPolicyToGRPC(s.TaskTemplate.RestartPolicy)
if err != nil {
return swarmapi.ServiceSpec{}, err
}
spec.Task.Restart = restartPolicy
if s.TaskTemplate.Placement != nil {
var preferences []*swarmapi.PlacementPreference
for _, pref := range s.TaskTemplate.Placement.Preferences {
if pref.Spread != nil {
preferences = append(preferences, &swarmapi.PlacementPreference{
Preference: &swarmapi.PlacementPreference_Spread{
Spread: &swarmapi.SpreadOver{
SpreadDescriptor: pref.Spread.SpreadDescriptor,
},
},
})
}
}
var platforms []*swarmapi.Platform
for _, plat := range s.TaskTemplate.Placement.Platforms {
platforms = append(platforms, &swarmapi.Platform{
Architecture: plat.Architecture,
OS: plat.OS,
})
}
spec.Task.Placement = &swarmapi.Placement{
Constraints: s.TaskTemplate.Placement.Constraints,
Preferences: preferences,
MaxReplicas: s.TaskTemplate.Placement.MaxReplicas,
Platforms: platforms,
}
}
spec.Update, err = updateConfigToGRPC(s.UpdateConfig)
if err != nil {
return swarmapi.ServiceSpec{}, err
}
spec.Rollback, err = updateConfigToGRPC(s.RollbackConfig)
if err != nil {
return swarmapi.ServiceSpec{}, err
}
if s.EndpointSpec != nil {
if s.EndpointSpec.Mode != "" &&
s.EndpointSpec.Mode != types.ResolutionModeVIP &&
s.EndpointSpec.Mode != types.ResolutionModeDNSRR {
return swarmapi.ServiceSpec{}, fmt.Errorf("invalid resolution mode: %q", s.EndpointSpec.Mode)
}
spec.Endpoint = &swarmapi.EndpointSpec{}
spec.Endpoint.Mode = swarmapi.EndpointSpec_ResolutionMode(swarmapi.EndpointSpec_ResolutionMode_value[strings.ToUpper(string(s.EndpointSpec.Mode))])
for _, portConfig := range s.EndpointSpec.Ports {
spec.Endpoint.Ports = append(spec.Endpoint.Ports, &swarmapi.PortConfig{
Name: portConfig.Name,
Protocol: swarmapi.PortConfig_Protocol(swarmapi.PortConfig_Protocol_value[strings.ToUpper(string(portConfig.Protocol))]),
PublishMode: swarmapi.PortConfig_PublishMode(swarmapi.PortConfig_PublishMode_value[strings.ToUpper(string(portConfig.PublishMode))]),
TargetPort: portConfig.TargetPort,
PublishedPort: portConfig.PublishedPort,
})
}
}
// Mode
numModes := 0
if s.Mode.Global != nil {
numModes++
}
if s.Mode.Replicated != nil {
numModes++
}
if s.Mode.ReplicatedJob != nil {
numModes++
}
if s.Mode.GlobalJob != nil {
numModes++
}
if numModes > 1 {
return swarmapi.ServiceSpec{}, errors.New("must specify only one service mode")
}
if s.Mode.Global != nil {
spec.Mode = &swarmapi.ServiceSpec_Global{
Global: &swarmapi.GlobalService{},
}
} else if s.Mode.GlobalJob != nil {
spec.Mode = &swarmapi.ServiceSpec_GlobalJob{
GlobalJob: &swarmapi.GlobalJob{},
}
} else if s.Mode.ReplicatedJob != nil {
// if the service is a replicated job, we have two different kinds of
// values that might need to be defaulted.
r := &swarmapi.ReplicatedJob{}
if s.Mode.ReplicatedJob.MaxConcurrent != nil {
r.MaxConcurrent = *s.Mode.ReplicatedJob.MaxConcurrent
} else {
r.MaxConcurrent = 1
}
if s.Mode.ReplicatedJob.TotalCompletions != nil {
r.TotalCompletions = *s.Mode.ReplicatedJob.TotalCompletions
} else {
r.TotalCompletions = r.MaxConcurrent
}
spec.Mode = &swarmapi.ServiceSpec_ReplicatedJob{
ReplicatedJob: r,
}
} else if s.Mode.Replicated != nil && s.Mode.Replicated.Replicas != nil {
spec.Mode = &swarmapi.ServiceSpec_Replicated{
Replicated: &swarmapi.ReplicatedService{Replicas: *s.Mode.Replicated.Replicas},
}
} else {
spec.Mode = &swarmapi.ServiceSpec_Replicated{
Replicated: &swarmapi.ReplicatedService{Replicas: 1},
}
}
return spec, nil
}
// annotationsFromGRPC converts a swarmkit Annotations message into the engine
// API Annotations type, guaranteeing that the Labels map is never nil.
func annotationsFromGRPC(ann swarmapi.Annotations) types.Annotations {
	labels := ann.Labels
	if labels == nil {
		// Normalize to an empty map so callers never observe a nil map.
		labels = make(map[string]string)
	}
	return types.Annotations{
		Name:   ann.Name,
		Labels: labels,
	}
}
// GenericResourcesFromGRPC converts a GRPC GenericResource to a GenericResource
func GenericResourcesFromGRPC(genericRes []*swarmapi.GenericResource) []types.GenericResource {
	var converted []types.GenericResource
	for _, res := range genericRes {
		var cur types.GenericResource
		switch spec := res.Resource.(type) {
		case *swarmapi.GenericResource_DiscreteResourceSpec:
			cur.DiscreteResourceSpec = &types.DiscreteGenericResource{
				Kind:  spec.DiscreteResourceSpec.Kind,
				Value: spec.DiscreteResourceSpec.Value,
			}
		case *swarmapi.GenericResource_NamedResourceSpec:
			cur.NamedResourceSpec = &types.NamedGenericResource{
				Kind:  spec.NamedResourceSpec.Kind,
				Value: spec.NamedResourceSpec.Value,
			}
		}
		// An unknown resource variant still appends a zero-valued entry,
		// mirroring the unconditional append in the original code.
		converted = append(converted, cur)
	}
	return converted
}
// resourcesFromGRPC creates a ResourceRequirements from the GRPC TaskSpec.
// We currently require the whole TaskSpec to be passed, because PidsLimit
// is returned as part of the container spec, instead of Resources
// TODO move PidsLimit to Resources in the Swarm API
func resourcesFromGRPC(ts *swarmapi.TaskSpec) *types.ResourceRequirements {
	var reqs *types.ResourceRequirements
	// A non-zero PidsLimit on the container spec surfaces as a Limits entry.
	if c := ts.GetContainer(); c != nil && c.PidsLimit != 0 {
		reqs = &types.ResourceRequirements{
			Limits: &types.Limit{Pids: c.PidsLimit},
		}
	}
	res := ts.Resources
	if res == nil {
		// nil is returned when neither PidsLimit nor Resources are set.
		return reqs
	}
	if reqs == nil {
		reqs = &types.ResourceRequirements{}
	}
	if lim := res.Limits; lim != nil {
		if reqs.Limits == nil {
			reqs.Limits = &types.Limit{}
		}
		reqs.Limits.NanoCPUs = lim.NanoCPUs
		reqs.Limits.MemoryBytes = lim.MemoryBytes
	}
	if rsv := res.Reservations; rsv != nil {
		reqs.Reservations = &types.Resources{
			NanoCPUs:         rsv.NanoCPUs,
			MemoryBytes:      rsv.MemoryBytes,
			GenericResources: GenericResourcesFromGRPC(rsv.Generic),
		}
	}
	return reqs
}
// GenericResourcesToGRPC converts a GenericResource to a GRPC GenericResource
func GenericResourcesToGRPC(genericRes []types.GenericResource) []*swarmapi.GenericResource {
	var converted []*swarmapi.GenericResource
	for _, res := range genericRes {
		var entry *swarmapi.GenericResource
		switch {
		case res.DiscreteResourceSpec != nil:
			entry = genericresource.NewDiscrete(res.DiscreteResourceSpec.Kind, res.DiscreteResourceSpec.Value)
		case res.NamedResourceSpec != nil:
			entry = genericresource.NewString(res.NamedResourceSpec.Kind, res.NamedResourceSpec.Value)
		}
		// A resource with neither spec set appends a nil entry, exactly as
		// the original code did.
		converted = append(converted, entry)
	}
	return converted
}
// resourcesToGRPC converts engine API resource requirements into the swarmkit
// representation; a nil input yields a nil result.
func resourcesToGRPC(res *types.ResourceRequirements) *swarmapi.ResourceRequirements {
	if res == nil {
		return nil
	}
	reqs := &swarmapi.ResourceRequirements{}
	if lim := res.Limits; lim != nil {
		// TODO add PidsLimit once Swarm API has been updated to move it into Limits
		reqs.Limits = &swarmapi.Resources{
			NanoCPUs:    lim.NanoCPUs,
			MemoryBytes: lim.MemoryBytes,
		}
	}
	if rsv := res.Reservations; rsv != nil {
		reqs.Reservations = &swarmapi.Resources{
			NanoCPUs:    rsv.NanoCPUs,
			MemoryBytes: rsv.MemoryBytes,
			Generic:     GenericResourcesToGRPC(rsv.GenericResources),
		}
	}
	return reqs
}
// restartPolicyFromGRPC converts a swarmkit RestartPolicy into the engine API
// type; nil input yields nil, and unknown conditions map to "any".
func restartPolicyFromGRPC(p *swarmapi.RestartPolicy) *types.RestartPolicy {
	if p == nil {
		return nil
	}
	rp := &types.RestartPolicy{}
	switch p.Condition {
	case swarmapi.RestartOnNone:
		rp.Condition = types.RestartPolicyConditionNone
	case swarmapi.RestartOnFailure:
		rp.Condition = types.RestartPolicyConditionOnFailure
	default:
		// Covers swarmapi.RestartOnAny as well as any unrecognized value.
		rp.Condition = types.RestartPolicyConditionAny
	}
	if p.Delay != nil {
		// Conversion errors are deliberately ignored; a malformed proto
		// duration results in the zero duration.
		d, _ := gogotypes.DurationFromProto(p.Delay)
		rp.Delay = &d
	}
	if p.Window != nil {
		w, _ := gogotypes.DurationFromProto(p.Window)
		rp.Window = &w
	}
	rp.MaxAttempts = &p.MaxAttempts
	return rp
}
// restartPolicyToGRPC converts an engine API RestartPolicy into the swarmkit
// type. An empty condition defaults to "any"; any other unknown condition is
// rejected with an error.
func restartPolicyToGRPC(p *types.RestartPolicy) (*swarmapi.RestartPolicy, error) {
	if p == nil {
		return nil, nil
	}
	rp := &swarmapi.RestartPolicy{}
	switch p.Condition {
	case types.RestartPolicyConditionNone:
		rp.Condition = swarmapi.RestartOnNone
	case types.RestartPolicyConditionOnFailure:
		rp.Condition = swarmapi.RestartOnFailure
	case types.RestartPolicyConditionAny, "":
		rp.Condition = swarmapi.RestartOnAny
	default:
		return nil, fmt.Errorf("invalid RestartCondition: %q", p.Condition)
	}
	if p.Delay != nil {
		rp.Delay = gogotypes.DurationProto(*p.Delay)
	}
	if p.Window != nil {
		rp.Window = gogotypes.DurationProto(*p.Window)
	}
	if p.MaxAttempts != nil {
		rp.MaxAttempts = *p.MaxAttempts
	}
	return rp, nil
}
// placementFromGRPC converts swarmkit placement constraints, preferences and
// platforms into the engine API Placement type; nil input yields nil.
func placementFromGRPC(p *swarmapi.Placement) *types.Placement {
	if p == nil {
		return nil
	}
	out := &types.Placement{
		Constraints: p.Constraints,
		MaxReplicas: p.MaxReplicas,
	}
	for _, pref := range p.Preferences {
		spread := pref.GetSpread()
		if spread == nil {
			// Only spread preferences are representable in the engine API.
			continue
		}
		out.Preferences = append(out.Preferences, types.PlacementPreference{
			Spread: &types.SpreadOver{SpreadDescriptor: spread.SpreadDescriptor},
		})
	}
	for _, plat := range p.Platforms {
		out.Platforms = append(out.Platforms, types.Platform{
			Architecture: plat.Architecture,
			OS:           plat.OS,
		})
	}
	return out
}
// driverFromGRPC converts a swarmkit Driver into the engine API Driver type;
// nil input yields nil.
func driverFromGRPC(p *swarmapi.Driver) *types.Driver {
	if p == nil {
		return nil
	}
	d := types.Driver{
		Name:    p.Name,
		Options: p.Options,
	}
	return &d
}
// driverToGRPC converts an engine API Driver into the swarmkit Driver type;
// nil input yields nil.
func driverToGRPC(p *types.Driver) *swarmapi.Driver {
	if p == nil {
		return nil
	}
	d := swarmapi.Driver{
		Name:    p.Name,
		Options: p.Options,
	}
	return &d
}
// updateConfigFromGRPC converts a swarmkit UpdateConfig into the engine API
// type; nil input yields nil. Unknown failure actions and orders are left at
// their zero values.
func updateConfigFromGRPC(updateConfig *swarmapi.UpdateConfig) *types.UpdateConfig {
	if updateConfig == nil {
		return nil
	}
	out := &types.UpdateConfig{
		Parallelism:     updateConfig.Parallelism,
		Delay:           updateConfig.Delay,
		MaxFailureRatio: updateConfig.MaxFailureRatio,
	}
	if updateConfig.Monitor != nil {
		// Conversion errors are ignored; a malformed duration yields zero.
		out.Monitor, _ = gogotypes.DurationFromProto(updateConfig.Monitor)
	}
	switch updateConfig.FailureAction {
	case swarmapi.UpdateConfig_PAUSE:
		out.FailureAction = types.UpdateFailureActionPause
	case swarmapi.UpdateConfig_CONTINUE:
		out.FailureAction = types.UpdateFailureActionContinue
	case swarmapi.UpdateConfig_ROLLBACK:
		out.FailureAction = types.UpdateFailureActionRollback
	}
	switch updateConfig.Order {
	case swarmapi.UpdateConfig_STOP_FIRST:
		out.Order = types.UpdateOrderStopFirst
	case swarmapi.UpdateConfig_START_FIRST:
		out.Order = types.UpdateOrderStartFirst
	}
	return out
}
// updateConfigToGRPC converts an engine API UpdateConfig into the swarmkit
// type; nil input yields (nil, nil). Empty failure action / order default to
// PAUSE and STOP_FIRST respectively; unknown values are rejected.
func updateConfigToGRPC(updateConfig *types.UpdateConfig) (*swarmapi.UpdateConfig, error) {
	if updateConfig == nil {
		return nil, nil
	}
	out := &swarmapi.UpdateConfig{
		Parallelism:     updateConfig.Parallelism,
		Delay:           updateConfig.Delay,
		MaxFailureRatio: updateConfig.MaxFailureRatio,
	}
	switch updateConfig.FailureAction {
	case types.UpdateFailureActionPause, "":
		out.FailureAction = swarmapi.UpdateConfig_PAUSE
	case types.UpdateFailureActionContinue:
		out.FailureAction = swarmapi.UpdateConfig_CONTINUE
	case types.UpdateFailureActionRollback:
		out.FailureAction = swarmapi.UpdateConfig_ROLLBACK
	default:
		return nil, fmt.Errorf("unrecognized update failure action %s", updateConfig.FailureAction)
	}
	if updateConfig.Monitor != 0 {
		out.Monitor = gogotypes.DurationProto(updateConfig.Monitor)
	}
	switch updateConfig.Order {
	case types.UpdateOrderStopFirst, "":
		out.Order = swarmapi.UpdateConfig_STOP_FIRST
	case types.UpdateOrderStartFirst:
		out.Order = swarmapi.UpdateConfig_START_FIRST
	default:
		return nil, fmt.Errorf("unrecognized update order %s", updateConfig.Order)
	}
	return out, nil
}
// networkAttachmentSpecFromGRPC converts a swarmkit NetworkAttachmentSpec to
// the engine API equivalent.
func networkAttachmentSpecFromGRPC(attachment swarmapi.NetworkAttachmentSpec) *types.NetworkAttachmentSpec {
	spec := types.NetworkAttachmentSpec{ContainerID: attachment.ContainerID}
	return &spec
}
// taskSpecFromGRPC converts a swarmkit TaskSpec into the engine API TaskSpec,
// dispatching on the concrete runtime (container, generic/plugin, or network
// attachment). It returns an error only when a plugin payload fails to
// unmarshal.
func taskSpecFromGRPC(taskSpec swarmapi.TaskSpec) (types.TaskSpec, error) {
	// Convert network attachment configs; DriverAttachmentOpts maps to the
	// engine-side DriverOpts field.
	taskNetworks := make([]types.NetworkAttachmentConfig, 0, len(taskSpec.Networks))
	for _, n := range taskSpec.Networks {
		netConfig := types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases, DriverOpts: n.DriverAttachmentOpts}
		taskNetworks = append(taskNetworks, netConfig)
	}
	t := types.TaskSpec{
		// resourcesFromGRPC takes the whole TaskSpec because PidsLimit lives
		// on the container spec in the swarm API, not under Resources.
		Resources:     resourcesFromGRPC(&taskSpec),
		RestartPolicy: restartPolicyFromGRPC(taskSpec.Restart),
		Placement:     placementFromGRPC(taskSpec.Placement),
		LogDriver:     driverFromGRPC(taskSpec.LogDriver),
		Networks:      taskNetworks,
		ForceUpdate:   taskSpec.ForceUpdate,
	}
	switch taskSpec.GetRuntime().(type) {
	// A nil runtime is treated like a container runtime here.
	case *swarmapi.TaskSpec_Container, nil:
		c := taskSpec.GetContainer()
		if c != nil {
			t.ContainerSpec = containerSpecFromGRPC(c)
		}
	case *swarmapi.TaskSpec_Generic:
		g := taskSpec.GetGeneric()
		if g != nil {
			// Only the plugin kind is recognized; other generic kinds are
			// silently ignored.
			switch g.Kind {
			case string(types.RuntimePlugin):
				var p runtime.PluginSpec
				if err := proto.Unmarshal(g.Payload.Value, &p); err != nil {
					return t, errors.Wrap(err, "error unmarshalling plugin spec")
				}
				t.PluginSpec = &p
			}
		}
	case *swarmapi.TaskSpec_Attachment:
		a := taskSpec.GetAttachment()
		if a != nil {
			t.NetworkAttachmentSpec = networkAttachmentSpecFromGRPC(*a)
		}
		// NOTE(review): t.Runtime is only populated for attachments, never
		// for the container/plugin branches — confirm callers rely on the
		// zero value in those cases.
		t.Runtime = types.RuntimeNetworkAttachment
	}
	return t, nil
}
package convert
import (
"fmt"
"strings"
gogotypes "github.com/gogo/protobuf/types"
types "github.com/moby/moby/api/types/swarm"
swarmapi "github.com/moby/swarmkit/v2/api"
"github.com/moby/swarmkit/v2/ca"
)
// SwarmFromGRPC converts a grpc Cluster to a Swarm.
//
// The loop variable for external CAs is named extCA (not "ca") so it does not
// shadow the imported swarmkit "ca" package used for IssuerFromAPIRootCA.
func SwarmFromGRPC(c swarmapi.Cluster) types.Swarm {
	swarm := types.Swarm{
		ClusterInfo: types.ClusterInfo{
			ID: c.ID,
			Spec: types.Spec{
				Orchestration: types.OrchestrationConfig{
					TaskHistoryRetentionLimit: &c.Spec.Orchestration.TaskHistoryRetentionLimit,
				},
				Raft: types.RaftConfig{
					SnapshotInterval:           c.Spec.Raft.SnapshotInterval,
					KeepOldSnapshots:           &c.Spec.Raft.KeepOldSnapshots,
					LogEntriesForSlowFollowers: c.Spec.Raft.LogEntriesForSlowFollowers,
					HeartbeatTick:              int(c.Spec.Raft.HeartbeatTick),
					ElectionTick:               int(c.Spec.Raft.ElectionTick),
				},
				EncryptionConfig: types.EncryptionConfig{
					AutoLockManagers: c.Spec.EncryptionConfig.AutoLockManagers,
				},
				CAConfig: types.CAConfig{
					// do not include the signing CA cert or key (it should already be redacted via the swarm APIs) -
					// the key because it's secret, and the cert because otherwise doing a get + update on the spec
					// can cause issues because the key would be missing and the cert wouldn't
					ForceRotate: c.Spec.CAConfig.ForceRotate,
				},
			},
			TLSInfo: types.TLSInfo{
				TrustRoot: string(c.RootCA.CACert),
			},
			RootRotationInProgress: c.RootCA.RootRotation != nil,
			DefaultAddrPool:        c.DefaultAddressPool,
			SubnetSize:             c.SubnetSize,
			DataPathPort:           c.VXLANUDPPort,
		},
		JoinTokens: types.JoinTokens{
			Worker:  c.RootCA.JoinTokens.Worker,
			Manager: c.RootCA.JoinTokens.Manager,
		},
	}
	// Issuer info is best-effort: on error the TLS issuer fields stay empty.
	issuerInfo, err := ca.IssuerFromAPIRootCA(&c.RootCA)
	if err == nil && issuerInfo != nil {
		swarm.TLSInfo.CertIssuerSubject = issuerInfo.Subject
		swarm.TLSInfo.CertIssuerPublicKey = issuerInfo.PublicKey
	}
	heartbeatPeriod, _ := gogotypes.DurationFromProto(c.Spec.Dispatcher.HeartbeatPeriod)
	swarm.Spec.Dispatcher.HeartbeatPeriod = heartbeatPeriod
	swarm.Spec.CAConfig.NodeCertExpiry, _ = gogotypes.DurationFromProto(c.Spec.CAConfig.NodeCertExpiry)
	for _, extCA := range c.Spec.CAConfig.ExternalCAs {
		swarm.Spec.CAConfig.ExternalCAs = append(swarm.Spec.CAConfig.ExternalCAs, &types.ExternalCA{
			Protocol: types.ExternalCAProtocol(strings.ToLower(extCA.Protocol.String())),
			URL:      extCA.URL,
			Options:  extCA.Options,
			CACert:   string(extCA.CACert),
		})
	}
	// Meta
	swarm.Version.Index = c.Meta.Version.Index
	swarm.CreatedAt, _ = gogotypes.TimestampFromProto(c.Meta.CreatedAt)
	swarm.UpdatedAt, _ = gogotypes.TimestampFromProto(c.Meta.UpdatedAt)
	// Annotations
	swarm.Spec.Annotations = annotationsFromGRPC(c.Spec.Annotations)
	return swarm
}
// SwarmSpecToGRPC converts a Spec to a grpc ClusterSpec.
//
// It is a convenience wrapper around MergeSwarmSpecToGRPC, starting from an
// empty ClusterSpec.
func SwarmSpecToGRPC(s types.Spec) (swarmapi.ClusterSpec, error) {
	return MergeSwarmSpecToGRPC(s, swarmapi.ClusterSpec{})
}
// MergeSwarmSpecToGRPC merges a Spec with an initial grpc ClusterSpec.
//
// We take the initSpec (either created from scratch, or returned by swarmkit),
// and will only change the value if the one taken from types.Spec is not nil or 0.
// In other words, if the value taken from types.Spec is nil or 0, we will maintain the status quo.
//
// It returns an error when an external CA uses an unrecognized protocol.
func MergeSwarmSpecToGRPC(s types.Spec, spec swarmapi.ClusterSpec) (swarmapi.ClusterSpec, error) {
	if s.Annotations.Name != "" {
		spec.Annotations.Name = s.Annotations.Name
	}
	if len(s.Annotations.Labels) != 0 {
		spec.Annotations.Labels = s.Annotations.Labels
	}
	if s.Orchestration.TaskHistoryRetentionLimit != nil {
		spec.Orchestration.TaskHistoryRetentionLimit = *s.Orchestration.TaskHistoryRetentionLimit
	}
	if s.Raft.SnapshotInterval != 0 {
		spec.Raft.SnapshotInterval = s.Raft.SnapshotInterval
	}
	if s.Raft.KeepOldSnapshots != nil {
		spec.Raft.KeepOldSnapshots = *s.Raft.KeepOldSnapshots
	}
	if s.Raft.LogEntriesForSlowFollowers != 0 {
		spec.Raft.LogEntriesForSlowFollowers = s.Raft.LogEntriesForSlowFollowers
	}
	if s.Raft.HeartbeatTick != 0 {
		spec.Raft.HeartbeatTick = uint32(s.Raft.HeartbeatTick)
	}
	if s.Raft.ElectionTick != 0 {
		spec.Raft.ElectionTick = uint32(s.Raft.ElectionTick)
	}
	if s.Dispatcher.HeartbeatPeriod != 0 {
		spec.Dispatcher.HeartbeatPeriod = gogotypes.DurationProto(s.Dispatcher.HeartbeatPeriod)
	}
	if s.CAConfig.NodeCertExpiry != 0 {
		spec.CAConfig.NodeCertExpiry = gogotypes.DurationProto(s.CAConfig.NodeCertExpiry)
	}
	if s.CAConfig.SigningCACert != "" {
		spec.CAConfig.SigningCACert = []byte(s.CAConfig.SigningCACert)
	}
	if s.CAConfig.SigningCAKey != "" {
		// do propagate the signing CA key here because we want to provide it TO the swarm APIs
		spec.CAConfig.SigningCAKey = []byte(s.CAConfig.SigningCAKey)
	}
	spec.CAConfig.ForceRotate = s.CAConfig.ForceRotate
	// Use "extCA" for the loop variable so it does not shadow the imported
	// swarmkit "ca" package.
	for _, extCA := range s.CAConfig.ExternalCAs {
		protocol, ok := swarmapi.ExternalCA_CAProtocol_value[strings.ToUpper(string(extCA.Protocol))]
		if !ok {
			return swarmapi.ClusterSpec{}, fmt.Errorf("invalid protocol: %q", extCA.Protocol)
		}
		spec.CAConfig.ExternalCAs = append(spec.CAConfig.ExternalCAs, &swarmapi.ExternalCA{
			Protocol: swarmapi.ExternalCA_CAProtocol(protocol),
			URL:      extCA.URL,
			Options:  extCA.Options,
			CACert:   []byte(extCA.CACert),
		})
	}
	spec.EncryptionConfig.AutoLockManagers = s.EncryptionConfig.AutoLockManagers
	return spec, nil
}
package convert
import (
"strings"
gogotypes "github.com/gogo/protobuf/types"
types "github.com/moby/moby/api/types/swarm"
swarmapi "github.com/moby/swarmkit/v2/api"
)
// TaskFromGRPC converts a grpc Task to a Task.
//
// The error return comes from converting the embedded TaskSpec; all other
// fields convert unconditionally.
func TaskFromGRPC(t swarmapi.Task) (types.Task, error) {
	containerStatus := t.Status.GetContainer()
	taskSpec, err := taskSpecFromGRPC(t.Spec)
	if err != nil {
		return types.Task{}, err
	}
	task := types.Task{
		ID:          t.ID,
		Annotations: annotationsFromGRPC(t.Annotations),
		ServiceID:   t.ServiceID,
		Slot:        int(t.Slot),
		NodeID:      t.NodeID,
		Spec:        taskSpec,
		Status: types.TaskStatus{
			// gRPC enum names are upper-case; the engine API uses the
			// lower-cased form.
			State:   types.TaskState(strings.ToLower(t.Status.State.String())),
			Message: t.Status.Message,
			Err:     t.Status.Err,
		},
		DesiredState:     types.TaskState(strings.ToLower(t.DesiredState.String())),
		GenericResources: GenericResourcesFromGRPC(t.AssignedGenericResources),
	}
	// Meta
	task.Version.Index = t.Meta.Version.Index
	// Timestamp conversion errors are ignored; malformed protos yield the
	// zero time.
	task.CreatedAt, _ = gogotypes.TimestampFromProto(t.Meta.CreatedAt)
	task.UpdatedAt, _ = gogotypes.TimestampFromProto(t.Meta.UpdatedAt)
	task.Status.Timestamp, _ = gogotypes.TimestampFromProto(t.Status.Timestamp)
	if containerStatus != nil {
		task.Status.ContainerStatus = &types.ContainerStatus{
			ContainerID: containerStatus.ContainerID,
			PID:         int(containerStatus.PID),
			ExitCode:    int(containerStatus.ExitCode),
		}
	}
	// NetworksAttachments
	for _, na := range t.Networks {
		task.NetworksAttachments = append(task.NetworksAttachments, networkAttachmentFromGRPC(na))
	}
	if t.JobIteration != nil {
		task.JobIteration = &types.Version{
			Index: t.JobIteration.Index,
		}
	}
	// appending to a nil slice is valid. if there are no items in t.Volumes,
	// then the task.Volumes will remain nil; otherwise, it will contain
	// converted entries.
	for _, v := range t.Volumes {
		task.Volumes = append(task.Volumes, types.VolumeAttachment{
			ID:     v.ID,
			Source: v.Source,
			Target: v.Target,
		})
	}
	if t.Status.PortStatus == nil {
		return task, nil
	}
	// Port status: map enum values back to their lower-cased string names.
	for _, p := range t.Status.PortStatus.Ports {
		task.Status.PortStatus.Ports = append(task.Status.PortStatus.Ports, types.PortConfig{
			Name:          p.Name,
			Protocol:      types.PortConfigProtocol(strings.ToLower(swarmapi.PortConfig_Protocol_name[int32(p.Protocol)])),
			PublishMode:   types.PortConfigPublishMode(strings.ToLower(swarmapi.PortConfig_PublishMode_name[int32(p.PublishMode)])),
			TargetPort:    p.TargetPort,
			PublishedPort: p.PublishedPort,
		})
	}
	return task, nil
}
package convert
import (
gogotypes "github.com/gogo/protobuf/types"
volumetypes "github.com/moby/moby/api/types/volume"
swarmapi "github.com/moby/swarmkit/v2/api"
)
// VolumeFromGRPC converts a swarmkit api Volume object to a docker api Volume
// object
func VolumeFromGRPC(v *swarmapi.Volume) volumetypes.Volume {
	clusterVolume := &volumetypes.ClusterVolume{
		ID: v.ID,
		Spec: volumetypes.ClusterVolumeSpec{
			Group:                     v.Spec.Group,
			AccessMode:                accessModeFromGRPC(v.Spec.AccessMode),
			AccessibilityRequirements: topologyRequirementFromGRPC(v.Spec.AccessibilityRequirements),
			CapacityRange:             capacityRangeFromGRPC(v.Spec.CapacityRange),
			Secrets:                   volumeSecretsFromGRPC(v.Spec.Secrets),
			Availability:              volumeAvailabilityFromGRPC(v.Spec.Availability),
		},
		PublishStatus: volumePublishStatusFromGRPC(v.PublishStatus),
		Info:          volumeInfoFromGRPC(v.VolumeInfo),
	}
	clusterVolume.Version.Index = v.Meta.Version.Index
	// Timestamp conversion errors are ignored; the zero time is used.
	clusterVolume.CreatedAt, _ = gogotypes.TimestampFromProto(v.Meta.CreatedAt)
	clusterVolume.UpdatedAt, _ = gogotypes.TimestampFromProto(v.Meta.UpdatedAt)
	return volumetypes.Volume{
		ClusterVolume: clusterVolume,
		CreatedAt:     clusterVolume.CreatedAt.String(),
		Driver:        v.Spec.Driver.Name,
		Labels:        v.Spec.Annotations.Labels,
		Name:          v.Spec.Annotations.Name,
		Options:       v.Spec.Driver.Options,
		Scope:         "global",
	}
}
// volumeSpecToGRPC converts an engine API ClusterVolumeSpec into the swarmkit
// VolumeSpec representation.
func volumeSpecToGRPC(spec volumetypes.ClusterVolumeSpec) *swarmapi.VolumeSpec {
	swarmSpec := &swarmapi.VolumeSpec{
		Group: spec.Group,
	}
	if spec.AccessMode != nil {
		swarmSpec.AccessMode = &swarmapi.VolumeAccessMode{}
		// Unknown scope/sharing values fall through and leave the swarmkit
		// zero values in place.
		switch spec.AccessMode.Scope {
		case volumetypes.ScopeSingleNode:
			swarmSpec.AccessMode.Scope = swarmapi.VolumeScopeSingleNode
		case volumetypes.ScopeMultiNode:
			swarmSpec.AccessMode.Scope = swarmapi.VolumeScopeMultiNode
		}
		switch spec.AccessMode.Sharing {
		case volumetypes.SharingNone:
			swarmSpec.AccessMode.Sharing = swarmapi.VolumeSharingNone
		case volumetypes.SharingReadOnly:
			swarmSpec.AccessMode.Sharing = swarmapi.VolumeSharingReadOnly
		case volumetypes.SharingOneWriter:
			swarmSpec.AccessMode.Sharing = swarmapi.VolumeSharingOneWriter
		case volumetypes.SharingAll:
			swarmSpec.AccessMode.Sharing = swarmapi.VolumeSharingAll
		}
		// NOTE(review): if both BlockVolume and MountVolume are set, the
		// Mount assignment below overwrites the Block one — confirm that
		// precedence is intended.
		if spec.AccessMode.BlockVolume != nil {
			swarmSpec.AccessMode.AccessType = &swarmapi.VolumeAccessMode_Block{
				Block: &swarmapi.VolumeAccessMode_BlockVolume{},
			}
		}
		if spec.AccessMode.MountVolume != nil {
			swarmSpec.AccessMode.AccessType = &swarmapi.VolumeAccessMode_Mount{
				Mount: &swarmapi.VolumeAccessMode_MountVolume{
					FsType:     spec.AccessMode.MountVolume.FsType,
					MountFlags: spec.AccessMode.MountVolume.MountFlags,
				},
			}
		}
	}
	for _, secret := range spec.Secrets {
		swarmSpec.Secrets = append(swarmSpec.Secrets, &swarmapi.VolumeSecret{
			Key:    secret.Key,
			Secret: secret.Secret,
		})
	}
	if spec.AccessibilityRequirements != nil {
		swarmSpec.AccessibilityRequirements = &swarmapi.TopologyRequirement{}
		for _, top := range spec.AccessibilityRequirements.Requisite {
			swarmSpec.AccessibilityRequirements.Requisite = append(
				swarmSpec.AccessibilityRequirements.Requisite,
				&swarmapi.Topology{
					Segments: top.Segments,
				},
			)
		}
		for _, top := range spec.AccessibilityRequirements.Preferred {
			swarmSpec.AccessibilityRequirements.Preferred = append(
				swarmSpec.AccessibilityRequirements.Preferred,
				&swarmapi.Topology{
					Segments: top.Segments,
				},
			)
		}
	}
	if spec.CapacityRange != nil {
		swarmSpec.CapacityRange = &swarmapi.CapacityRange{
			RequiredBytes: spec.CapacityRange.RequiredBytes,
			LimitBytes:    spec.CapacityRange.LimitBytes,
		}
	}
	// availability is not a pointer, it is a value. if the user does not
	// specify an availability, it will be inferred as the 0-value, which is
	// "active".
	switch spec.Availability {
	case volumetypes.AvailabilityActive:
		swarmSpec.Availability = swarmapi.VolumeAvailabilityActive
	case volumetypes.AvailabilityPause:
		swarmSpec.Availability = swarmapi.VolumeAvailabilityPause
	case volumetypes.AvailabilityDrain:
		swarmSpec.Availability = swarmapi.VolumeAvailabilityDrain
	}
	return swarmSpec
}
// VolumeCreateToGRPC takes a VolumeCreateBody and outputs the matching
// swarmapi VolumeSpec.
//
// A nil volume yields an empty spec. The original code nil-checked volume
// when reading ClusterVolumeSpec but then dereferenced it unconditionally for
// Name/Labels/Driver, which panicked on nil input; the early return fixes that.
func VolumeCreateToGRPC(volume *volumetypes.CreateOptions) *swarmapi.VolumeSpec {
	if volume == nil {
		return &swarmapi.VolumeSpec{}
	}
	swarmSpec := &swarmapi.VolumeSpec{}
	if volume.ClusterVolumeSpec != nil {
		swarmSpec = volumeSpecToGRPC(*volume.ClusterVolumeSpec)
	}
	swarmSpec.Annotations = swarmapi.Annotations{
		Name:   volume.Name,
		Labels: volume.Labels,
	}
	swarmSpec.Driver = &swarmapi.Driver{
		Name:    volume.Driver,
		Options: volume.DriverOpts,
	}
	return swarmSpec
}
// volumeInfoFromGRPC converts a swarmkit VolumeInfo into the engine API Info
// type; nil input yields nil.
func volumeInfoFromGRPC(info *swarmapi.VolumeInfo) *volumetypes.Info {
	if info == nil {
		return nil
	}
	// Topology stays nil when the source slice is nil.
	var topology []volumetypes.Topology
	if info.AccessibleTopology != nil {
		topology = make([]volumetypes.Topology, 0, len(info.AccessibleTopology))
		for _, top := range info.AccessibleTopology {
			topology = append(topology, topologyFromGRPC(top))
		}
	}
	return &volumetypes.Info{
		CapacityBytes:      info.CapacityBytes,
		VolumeContext:      info.VolumeContext,
		VolumeID:           info.VolumeID,
		AccessibleTopology: topology,
	}
}
// volumePublishStatusFromGRPC converts the swarmkit per-node publish status
// list into the engine API form; nil input yields nil.
func volumePublishStatusFromGRPC(publishStatus []*swarmapi.VolumePublishStatus) []*volumetypes.PublishStatus {
	if publishStatus == nil {
		return nil
	}
	out := make([]*volumetypes.PublishStatus, 0, len(publishStatus))
	for _, status := range publishStatus {
		converted := &volumetypes.PublishStatus{
			NodeID:         status.NodeID,
			PublishContext: status.PublishContext,
		}
		// Unknown states leave the zero value, matching the original switch.
		switch status.State {
		case swarmapi.VolumePublishStatus_PENDING_PUBLISH:
			converted.State = volumetypes.StatePending
		case swarmapi.VolumePublishStatus_PUBLISHED:
			converted.State = volumetypes.StatePublished
		case swarmapi.VolumePublishStatus_PENDING_NODE_UNPUBLISH:
			converted.State = volumetypes.StatePendingNodeUnpublish
		case swarmapi.VolumePublishStatus_PENDING_UNPUBLISH:
			converted.State = volumetypes.StatePendingUnpublish
		}
		out = append(out, converted)
	}
	return out
}
// accessModeFromGRPC converts a swarmkit VolumeAccessMode into the engine API
// AccessMode; nil input yields nil.
func accessModeFromGRPC(accessMode *swarmapi.VolumeAccessMode) *volumetypes.AccessMode {
	if accessMode == nil {
		return nil
	}
	mode := &volumetypes.AccessMode{}
	switch accessMode.Scope {
	case swarmapi.VolumeScopeSingleNode:
		mode.Scope = volumetypes.ScopeSingleNode
	case swarmapi.VolumeScopeMultiNode:
		mode.Scope = volumetypes.ScopeMultiNode
	}
	switch accessMode.Sharing {
	case swarmapi.VolumeSharingNone:
		mode.Sharing = volumetypes.SharingNone
	case swarmapi.VolumeSharingReadOnly:
		mode.Sharing = volumetypes.SharingReadOnly
	case swarmapi.VolumeSharingOneWriter:
		mode.Sharing = volumetypes.SharingOneWriter
	case swarmapi.VolumeSharingAll:
		mode.Sharing = volumetypes.SharingAll
	}
	if accessMode.GetBlock() != nil {
		mode.BlockVolume = &volumetypes.TypeBlock{}
	}
	if m := accessMode.GetMount(); m != nil {
		mode.MountVolume = &volumetypes.TypeMount{
			FsType:     m.FsType,
			MountFlags: m.MountFlags,
		}
	}
	return mode
}
// volumeSecretsFromGRPC converts the swarmkit secret references of a volume
// into the engine API form; nil input yields nil.
func volumeSecretsFromGRPC(secrets []*swarmapi.VolumeSecret) []volumetypes.Secret {
	if secrets == nil {
		return nil
	}
	out := make([]volumetypes.Secret, 0, len(secrets))
	for _, s := range secrets {
		out = append(out, volumetypes.Secret{
			Key:    s.Key,
			Secret: s.Secret,
		})
	}
	return out
}
// topologyRequirementFromGRPC converts a swarmkit TopologyRequirement into the
// engine API form; nil input yields nil, and nil sub-slices stay nil.
func topologyRequirementFromGRPC(top *swarmapi.TopologyRequirement) *volumetypes.TopologyRequirement {
	if top == nil {
		return nil
	}
	out := &volumetypes.TopologyRequirement{}
	if top.Requisite != nil {
		out.Requisite = make([]volumetypes.Topology, 0, len(top.Requisite))
		for _, req := range top.Requisite {
			out.Requisite = append(out.Requisite, topologyFromGRPC(req))
		}
	}
	if top.Preferred != nil {
		out.Preferred = make([]volumetypes.Topology, 0, len(top.Preferred))
		for _, pref := range top.Preferred {
			out.Preferred = append(out.Preferred, topologyFromGRPC(pref))
		}
	}
	return out
}
// topologyFromGRPC converts a swarmkit Topology into the engine API form; nil
// input yields the zero value.
func topologyFromGRPC(top *swarmapi.Topology) volumetypes.Topology {
	var converted volumetypes.Topology
	if top != nil {
		converted.Segments = top.Segments
	}
	return converted
}
// capacityRangeFromGRPC converts a swarmkit CapacityRange into the engine API
// form; nil input yields nil.
func capacityRangeFromGRPC(capacity *swarmapi.CapacityRange) *volumetypes.CapacityRange {
	if capacity == nil {
		return nil
	}
	out := volumetypes.CapacityRange{
		RequiredBytes: capacity.RequiredBytes,
		LimitBytes:    capacity.LimitBytes,
	}
	return &out
}
// volumeAvailabilityFromGRPC converts a swarmkit volume availability into the
// engine API form. Unknown values fall back to "drain", the most restrictive
// state.
func volumeAvailabilityFromGRPC(availability swarmapi.VolumeSpec_VolumeAvailability) volumetypes.Availability {
	if availability == swarmapi.VolumeAvailabilityActive {
		return volumetypes.AvailabilityActive
	}
	if availability == swarmapi.VolumeAvailabilityPause {
		return volumetypes.AvailabilityPause
	}
	// Drain is both its own mapping and the fail-safe default.
	return volumetypes.AvailabilityDrain
}
package config
import (
"encoding/json"
"sort"
"strings"
bkconfig "github.com/moby/buildkit/cmd/buildkitd/config"
"github.com/moby/moby/api/types/filters"
)
// BuilderGCRule represents a GC rule for buildkit cache
type BuilderGCRule struct {
	// All, when set, applies the rule regardless of the filter.
	All bool `json:",omitempty"`
	// Filter selects the cache records this rule applies to.
	Filter BuilderGCFilter `json:",omitempty"`
	// ReservedSpace, MaxUsedSpace, and MinFreeSpace are size strings; they
	// are parsed elsewhere — TODO confirm accepted formats (bytes vs "10GB").
	ReservedSpace string `json:",omitempty"`
	MaxUsedSpace  string `json:",omitempty"`
	MinFreeSpace  string `json:",omitempty"`
}
// UnmarshalJSON decodes a BuilderGCRule, honoring the deprecated KeepStorage
// field as a fallback for ReservedSpace when the latter is absent.
func (x *BuilderGCRule) UnmarshalJSON(data []byte) error {
	type gcRule struct {
		All           bool            `json:",omitempty"`
		Filter        BuilderGCFilter `json:",omitempty"`
		ReservedSpace string          `json:",omitempty"`
		MaxUsedSpace  string          `json:",omitempty"`
		MinFreeSpace  string          `json:",omitempty"`
		// Deprecated option is now equivalent to ReservedSpace.
		KeepStorage string `json:",omitempty"`
	}
	var decoded gcRule
	if err := json.Unmarshal(data, &decoded); err != nil {
		return err
	}
	*x = BuilderGCRule{
		All:           decoded.All,
		Filter:        decoded.Filter,
		ReservedSpace: decoded.ReservedSpace,
		MaxUsedSpace:  decoded.MaxUsedSpace,
		MinFreeSpace:  decoded.MinFreeSpace,
	}
	if x.ReservedSpace == "" {
		x.ReservedSpace = decoded.KeepStorage
	}
	return nil
}
// BuilderGCFilter contains garbage-collection filter rules for a BuildKit builder.
// It wraps filters.Args so the filter can be (un)marshalled as a sorted
// array of "key=value" strings instead of the filters.Args wire format.
type BuilderGCFilter filters.Args
// MarshalJSON returns a JSON byte representation of the BuilderGCFilter
// as a sorted array of "key=value" strings.
func (x *BuilderGCFilter) MarshalJSON() ([]byte, error) {
	f := filters.Args(*x)
	keys := f.Keys()
	sort.Strings(keys)
	// Pre-size and keep a non-nil slice so an empty filter encodes as [],
	// not null.
	pairs := make([]string, 0, len(keys))
	for _, key := range keys {
		for _, val := range f.Get(key) {
			pairs = append(pairs, key+"="+val)
		}
	}
	return json.Marshal(pairs)
}
// UnmarshalJSON fills the BuilderGCFilter values structure from JSON input.
// The canonical format is an array of "key=value" strings; the deprecated
// filters.Args object form is still accepted for backwards compatibility.
func (x *BuilderGCFilter) UnmarshalJSON(data []byte) error {
	f := filters.NewArgs()
	var entries []string
	if err := json.Unmarshal(data, &entries); err != nil {
		// backwards compat for deprecated buggy form
		err := json.Unmarshal(data, &f)
		*x = BuilderGCFilter(f)
		return err
	}
	for _, entry := range entries {
		key, val, _ := strings.Cut(entry, "=")
		key = strings.ToLower(strings.TrimSpace(key))
		f.Add(key, strings.TrimSpace(val))
	}
	*x = BuilderGCFilter(f)
	return nil
}
// BuilderGCConfig contains GC config for a buildkit builder
type BuilderGCConfig struct {
	// Enabled is a tri-state pointer: nil means "not configured" and is
	// treated as enabled (see IsEnabled).
	Enabled *bool `json:",omitempty"`
	// Policy lists the GC rules to apply, in order.
	Policy []BuilderGCRule `json:",omitempty"`
	// Default*Space are builder-wide defaults applied when a rule does not
	// specify its own values — TODO confirm against the BuildKit daemon docs.
	DefaultReservedSpace string `json:",omitempty"`
	DefaultMaxUsedSpace  string `json:",omitempty"`
	DefaultMinFreeSpace  string `json:",omitempty"`
}
// IsEnabled reports whether builder GC is enabled; an unset (nil) value
// defaults to true.
func (x *BuilderGCConfig) IsEnabled() bool {
	if x.Enabled == nil {
		return true
	}
	return *x.Enabled
}
// UnmarshalJSON decodes a BuilderGCConfig. GC defaults to enabled when the
// field is absent, and the deprecated DefaultKeepStorage field backfills
// DefaultReservedSpace.
func (x *BuilderGCConfig) UnmarshalJSON(data []byte) error {
	type gcConfig struct {
		Enabled              bool            `json:",omitempty"`
		Policy               []BuilderGCRule `json:",omitempty"`
		DefaultReservedSpace string          `json:",omitempty"`
		DefaultMaxUsedSpace  string          `json:",omitempty"`
		DefaultMinFreeSpace  string          `json:",omitempty"`
		// Deprecated option is now equivalent to DefaultReservedSpace.
		DefaultKeepStorage string `json:",omitempty"`
	}
	// Pre-set the default so an absent "Enabled" key decodes as true.
	decoded := gcConfig{Enabled: true}
	if err := json.Unmarshal(data, &decoded); err != nil {
		return err
	}
	x.Enabled = &decoded.Enabled
	x.Policy = decoded.Policy
	x.DefaultReservedSpace = decoded.DefaultReservedSpace
	x.DefaultMaxUsedSpace = decoded.DefaultMaxUsedSpace
	x.DefaultMinFreeSpace = decoded.DefaultMinFreeSpace
	if x.DefaultReservedSpace == "" {
		x.DefaultReservedSpace = decoded.DefaultKeepStorage
	}
	return nil
}
// BuilderHistoryConfig contains history config for a buildkit builder
type BuilderHistoryConfig struct {
	// MaxAge is the maximum age of history entries to retain.
	MaxAge bkconfig.Duration `json:",omitempty"`
	// MaxEntries is the maximum number of history entries to retain.
	MaxEntries int64 `json:",omitempty"`
}
// BuilderEntitlements contains settings to enable/disable entitlements
// A nil pointer means the entitlement was not explicitly configured.
type BuilderEntitlements struct {
	NetworkHost      *bool `json:"network-host,omitempty"`
	SecurityInsecure *bool `json:"security-insecure,omitempty"`
}
// BuilderConfig contains config for the builder
type BuilderConfig struct {
	GC           BuilderGCConfig       `json:",omitempty"`
	Entitlements BuilderEntitlements   `json:",omitempty"`
	History      *BuilderHistoryConfig `json:",omitempty"`
}
package config
import (
"bytes"
"encoding/json"
stderrors "errors"
"fmt"
"net"
"net/netip"
"net/url"
"os"
"strings"
"dario.cat/mergo"
"github.com/containerd/log"
"github.com/docker/docker/daemon/pkg/opts"
dopts "github.com/docker/docker/internal/opts"
"github.com/docker/docker/registry"
"github.com/moby/moby/api"
"github.com/moby/moby/api/types/versions"
"github.com/pkg/errors"
"github.com/spf13/pflag"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
)
const (
	// DefaultMaxConcurrentDownloads is the default value for
	// maximum number of downloads that
	// may take place at a time.
	DefaultMaxConcurrentDownloads = 3
	// DefaultMaxConcurrentUploads is the default value for
	// maximum number of uploads that
	// may take place at a time.
	DefaultMaxConcurrentUploads = 5
	// DefaultDownloadAttempts is the default value for
	// maximum number of attempts that
	// may take place at a time for each pull when the connection is lost.
	DefaultDownloadAttempts = 5
	// DefaultShmSize is the default value for container's shm size (64 MiB)
	DefaultShmSize int64 = 64 * 1024 * 1024
	// DefaultNetworkMtu is the default value for network MTU
	DefaultNetworkMtu = 1500
	// DisableNetworkBridge is the default value of the option to disable network bridge
	DisableNetworkBridge = "none"
	// DefaultShutdownTimeout is the default shutdown timeout (in seconds) for
	// the daemon for containers to stop when it is shutting down.
	DefaultShutdownTimeout = 15
	// DefaultInitBinary is the name of the default init binary
	DefaultInitBinary = "docker-init"
	// DefaultRuntimeBinary is the default runtime to be used by
	// containerd if none is specified
	DefaultRuntimeBinary = "runc"
	// DefaultContainersNamespace is the name of the default containerd namespace used for users' containers.
	DefaultContainersNamespace = "moby"
	// DefaultPluginNamespace is the name of the default containerd namespace used for plugins.
	DefaultPluginNamespace = "plugins.moby"
	// DefaultAPIVersion is the highest REST API version supported by the daemon.
	//
	// This version may be lower than the [api.DefaultVersion], which is the default
	// (and highest supported) version of the api library module used.
	DefaultAPIVersion = "1.52"
	// defaultMinAPIVersion is the minimum API version supported by the API.
	// This version can be overridden through the "DOCKER_MIN_API_VERSION"
	// environment variable. It currently defaults to the minimum API version
	// supported by the API server.
	defaultMinAPIVersion = api.MinSupportedAPIVersion
	// SeccompProfileDefault is the built-in default seccomp profile.
	SeccompProfileDefault = "builtin"
	// SeccompProfileUnconfined is a special profile name for seccomp to use an
	// "unconfined" seccomp profile.
	SeccompProfileUnconfined = "unconfined"
	// LibnetDataPath is the path to libnetwork's data directory, relative to cfg.Root.
	// Windows tolerates the "/".
	LibnetDataPath = "network/files"
)
// flatOptions contains configuration keys
// that MUST NOT be parsed as deep structures.
// Use this to differentiate these options
// with others like the ones in TLSOptions.
//
// Keys listed here are kept as a single entry by configValuesSet instead of
// having their nested object flattened one level.
var flatOptions = map[string]bool{
	"cluster-store-opts":   true,
	"default-network-opts": true,
	"log-opts":             true,
	"runtimes":             true,
	"default-ulimits":      true,
	"features":             true,
	"builder":              true,
}
// skipValidateOptions contains configuration keys
// that will be skipped from findConfigurationConflicts
// for unknown flag validation.
var skipValidateOptions = map[string]bool{
	"features": true,
	"builder":  true,
	// Deprecated options that are safe to ignore if present.
	"deprecated-key-path":              true,
	"allow-nondistributable-artifacts": true,
}
// skipDuplicates contains configuration keys that
// will be skipped when checking duplicated
// configuration field defined in both daemon
// config file and from dockerd cli flags.
// This allows some configurations to be merged
// during the parsing.
var skipDuplicates = map[string]bool{
	"runtimes": true,
}
// migratedNamedConfig describes legacy configuration file keys that have been migrated
// from simple entries equivalent to command line flags, to a named option.
//
// For example, "host-gateway-ip" allowed for a single IP address. "host-gateway-ips"
// allows for an IPv4 and an IPv6 address, and is implemented as a NamedOption for
// command line flag "--host-gateway-ip".
//
// Each legacy name is mapped to its new name and a function that can be called to
// migrate config from one to the other. The migration function is only called after
// confirming that the option is only specified in one of the new, old or command
// line options.
var migratedNamedConfig = map[string]struct {
	newName string
	migrate func(*Config)
}{
	"host-gateway-ip": {newName: "host-gateway-ips", migrate: migrateHostGatewayIP},
}
// LogConfig represents the default log configuration.
// It includes json tags to deserialize configuration from a file
// using the same names that the flags in the command line use.
type LogConfig struct {
	// Type is the name of the log driver (json key "log-driver").
	Type string `json:"log-driver,omitempty"`
	// Config holds driver-specific options (json key "log-opts").
	Config map[string]string `json:"log-opts,omitempty"`
}
// commonBridgeConfig stores all the platform-common bridge driver specific
// configuration.
type commonBridgeConfig struct {
	// Iface is the name of the bridge interface to use (json key "bridge").
	Iface string `json:"bridge,omitempty"`
	// FixedCIDR restricts the IP range for dynamic address allocation.
	FixedCIDR string `json:"fixed-cidr,omitempty"`
}
// NetworkConfig stores the daemon-wide networking configurations
type NetworkConfig struct {
	// Default address pools for docker networks
	DefaultAddressPools opts.PoolsOpt `json:"default-address-pools,omitempty"`
	// NetworkControlPlaneMTU allows to specify the control plane MTU, this will allow to optimize the network use in some components
	NetworkControlPlaneMTU int `json:"network-control-plane-mtu,omitempty"`
	// Default options for newly created networks
	DefaultNetworkOpts map[string]map[string]string `json:"default-network-opts,omitempty"`
	// FirewallBackend overrides the daemon's default selection of firewall
	// implementation. Currently only used on Linux, it is an error to
	// supply a value for other platforms.
	FirewallBackend string `json:"firewall-backend,omitempty"`
}
// TLSOptions defines TLS configuration for the daemon server.
// It includes json tags to deserialize configuration from a file
// using the same names that the flags in the command line use.
type TLSOptions struct {
	// CAFile is the path to the certificate authority used to verify peers.
	CAFile string `json:"tlscacert,omitempty"`
	// CertFile is the path to the server's TLS certificate.
	CertFile string `json:"tlscert,omitempty"`
	// KeyFile is the path to the server's TLS private key.
	KeyFile string `json:"tlskey,omitempty"`
}
// DNSConfig defines the DNS configurations.
type DNSConfig struct {
	DNS        []net.IP `json:"dns,omitempty"`
	DNSOptions []string `json:"dns-opts,omitempty"`
	DNSSearch  []string `json:"dns-search,omitempty"`
	// HostGatewayIP is the deprecated single-address form; it is moved into
	// HostGatewayIPs by migrateHostGatewayIP after config load.
	HostGatewayIP  net.IP       `json:"host-gateway-ip,omitempty"` // Deprecated: this single-IP is migrated to HostGatewayIPs
	HostGatewayIPs []netip.Addr `json:"host-gateway-ips,omitempty"`
}
// CommonConfig defines the configuration of a docker daemon which is
// common across platforms.
// It includes json tags to deserialize configuration from a file
// using the same names that the flags in the command line use.
type CommonConfig struct {
	AuthorizationPlugins  []string `json:"authorization-plugins,omitempty"` // AuthorizationPlugins holds list of authorization plugins
	AutoRestart           bool     `json:"-"`
	DisableBridge         bool     `json:"-"`
	ExecOptions           []string `json:"exec-opts,omitempty"`
	GraphDriver           string   `json:"storage-driver,omitempty"`
	GraphOptions          []string `json:"storage-opts,omitempty"`
	Labels                []string `json:"labels,omitempty"`
	NetworkDiagnosticPort int      `json:"network-diagnostic-port,omitempty"`
	Pidfile               string   `json:"pidfile,omitempty"`
	RawLogs               bool     `json:"raw-logs,omitempty"`
	Root                  string   `json:"data-root,omitempty"`
	ExecRoot              string   `json:"exec-root,omitempty"`
	SocketGroup           string   `json:"group,omitempty"`
	CorsHeaders           string   `json:"api-cors-header,omitempty"` // Deprecated: CORS headers should not be set on the API. This feature will be removed in the next release. // TODO(thaJeztah): option is used to produce error when used; remove in next release

	// Proxies holds the proxies that are configured for the daemon.
	Proxies `json:"proxies"`

	// LiveRestoreEnabled determines whether we should keep containers
	// alive upon daemon shutdown/start
	LiveRestoreEnabled bool `json:"live-restore,omitempty"`

	// MaxConcurrentDownloads is the maximum number of downloads that
	// may take place at a time for each pull.
	MaxConcurrentDownloads int `json:"max-concurrent-downloads,omitempty"`

	// MaxConcurrentUploads is the maximum number of uploads that
	// may take place at a time for each push.
	MaxConcurrentUploads int `json:"max-concurrent-uploads,omitempty"`

	// MaxDownloadAttempts is the maximum number of attempts that
	// may take place at a time for each pull.
	MaxDownloadAttempts int `json:"max-download-attempts,omitempty"`

	// ShutdownTimeout is the timeout value (in seconds) the daemon will wait for the container
	// to stop when daemon is being shutdown
	ShutdownTimeout int `json:"shutdown-timeout,omitempty"`

	Debug     bool             `json:"debug,omitempty"`
	Hosts     []string         `json:"hosts,omitempty"`
	LogLevel  string           `json:"log-level,omitempty"`
	LogFormat log.OutputFormat `json:"log-format,omitempty"`
	TLS       *bool            `json:"tls,omitempty"`
	TLSVerify *bool            `json:"tlsverify,omitempty"`

	// Embedded structs that allow config
	// deserialization without the full struct.
	TLSOptions

	// SwarmDefaultAdvertiseAddr is the default host/IP or network interface
	// to use if a wildcard address is specified in the ListenAddr value
	// given to the /swarm/init endpoint and no advertise address is
	// specified.
	SwarmDefaultAdvertiseAddr string `json:"swarm-default-advertise-addr"`

	// SwarmRaftHeartbeatTick is the number of ticks in time for swarm mode raft quorum heartbeat
	// Typical value is 1
	SwarmRaftHeartbeatTick uint32 `json:"swarm-raft-heartbeat-tick"`

	// SwarmRaftElectionTick is the number of ticks to elapse before followers in the quorum can propose
	// a new round of leader election. Default, recommended value is at least 10X that of Heartbeat tick.
	// Higher values can make the quorum less sensitive to transient faults in the environment, but this also
	// means it takes longer for the managers to detect a down leader.
	SwarmRaftElectionTick uint32 `json:"swarm-raft-election-tick"`

	MetricsAddress string `json:"metrics-addr"`

	DNSConfig
	LogConfig
	BridgeConfig // BridgeConfig holds bridge network specific configuration.
	NetworkConfig
	registry.ServiceOptions

	// FIXME(vdemeester) This part is not that clear and is mainly dependent on cli flags
	// It should probably be handled outside this package.
	ValuesSet map[string]interface{} `json:"-"`

	Experimental bool `json:"experimental"` // Experimental indicates whether experimental features should be exposed or not

	// Exposed node Generic Resources
	// e.g: ["orange=red", "orange=green", "orange=blue", "apple=3"]
	NodeGenericResources []string `json:"node-generic-resources,omitempty"`

	// ContainerAddr is the address used to connect to containerd if we're
	// not starting it ourselves
	ContainerdAddr string `json:"containerd,omitempty"`

	// CriContainerd determines whether a supervised containerd instance
	// should be configured with the CRI plugin enabled. This allows using
	// Docker's containerd instance directly with a Kubernetes kubelet.
	CriContainerd bool `json:"cri-containerd,omitempty"`

	// Features contains a list of feature key value pairs indicating what features are enabled or disabled.
	// If a certain feature doesn't appear in this list then it's unset (i.e. neither true nor false).
	Features map[string]bool `json:"features,omitempty"`

	Builder BuilderConfig `json:"builder,omitempty"`

	ContainerdNamespace       string `json:"containerd-namespace,omitempty"`
	ContainerdPluginNamespace string `json:"containerd-plugin-namespace,omitempty"`

	DefaultRuntime string `json:"default-runtime,omitempty"`

	// CDISpecDirs is a list of directories in which CDI specifications can be found.
	CDISpecDirs []string `json:"cdi-spec-dirs,omitempty"`

	// The minimum API version provided by the daemon. Defaults to [defaultMinAPIVersion].
	//
	// The DOCKER_MIN_API_VERSION allows overriding the minimum API version within
	// constraints of the minimum and maximum (current) supported API versions.
	//
	// API versions older than [defaultMinAPIVersion] are deprecated and
	// to be removed in a future release. The "DOCKER_MIN_API_VERSION" env
	// var should only be used for exceptional cases, and the MinAPIVersion
	// field is therefore not included in the JSON representation.
	MinAPIVersion string `json:"-"`
}
// Proxies holds the proxies that are configured for the daemon.
// Credentials embedded in these URLs are masked by Sanitize before printing.
type Proxies struct {
	HTTPProxy  string `json:"http-proxy,omitempty"`
	HTTPSProxy string `json:"https-proxy,omitempty"`
	NoProxy    string `json:"no-proxy,omitempty"`
}
// IsValueSet returns true if a configuration value
// was explicitly set in the configuration file.
func (conf *Config) IsValueSet(name string) bool {
	if conf.ValuesSet == nil {
		return false
	}
	_, found := conf.ValuesSet[name]
	return found
}
// New returns a new fully initialized Config struct with default values set.
//
// It first populates the platform-agnostic defaults and then applies the
// platform-specific ones via setPlatformDefaults; an error from the latter
// is returned as-is.
func New() (*Config, error) {
	// platform-agnostic default values for the Config.
	cfg := &Config{
		CommonConfig: CommonConfig{
			ShutdownTimeout: DefaultShutdownTimeout,
			LogConfig: LogConfig{
				Config: make(map[string]string),
			},
			MaxConcurrentDownloads: DefaultMaxConcurrentDownloads,
			MaxConcurrentUploads:   DefaultMaxConcurrentUploads,
			MaxDownloadAttempts:    DefaultDownloadAttempts,
			BridgeConfig: BridgeConfig{
				DefaultBridgeConfig: DefaultBridgeConfig{
					MTU: DefaultNetworkMtu,
				},
			},
			NetworkConfig: NetworkConfig{
				NetworkControlPlaneMTU: DefaultNetworkMtu,
				DefaultNetworkOpts:     make(map[string]map[string]string),
			},
			ContainerdNamespace:       DefaultContainersNamespace,
			ContainerdPluginNamespace: DefaultPluginNamespace,
			Features:                  make(map[string]bool),
			DefaultRuntime:            StockRuntimeName,
			MinAPIVersion:             defaultMinAPIVersion,
		},
	}

	if err := setPlatformDefaults(cfg); err != nil {
		return nil, err
	}

	return cfg, nil
}
// GetExecOpt looks up a user-configured exec-opt. It returns a boolean
// if found, and an error if the configuration has invalid options set.
func (conf *Config) GetExecOpt(name string) (val string, found bool, _ error) {
	parsed, err := parseExecOptions(conf.ExecOptions)
	if err != nil {
		return "", false, err
	}
	v, ok := parsed[name]
	return v, ok, nil
}
// GetConflictFreeLabels validates Labels for conflict
// In swarm the duplicates for labels are removed
// so we only take same values here, no conflict values
// If the key-value is the same we will only take the last label
//
// It returns an error when the same key appears with two different values.
// Labels without a "=" separator are dropped from the result. Note that
// the order of the returned labels is not deterministic (map iteration).
func GetConflictFreeLabels(labels []string) ([]string, error) {
	labelMap := make(map[string]string, len(labels))
	for _, label := range labels {
		key, val, ok := strings.Cut(label, "=")
		if !ok {
			// No "=" separator; such entries are ignored.
			continue
		}
		// If there is a conflict we will return an error
		if v, exists := labelMap[key]; exists && v != val {
			return nil, errors.Errorf("conflict labels for %s=%s and %s=%s", key, val, key, v)
		}
		labelMap[key] = val
	}

	// Pre-size the result; a non-nil slice is returned even when empty.
	newLabels := make([]string, 0, len(labelMap))
	for k, v := range labelMap {
		newLabels = append(newLabels, k+"="+v)
	}
	return newLabels, nil
}
// Reload reads the configuration in the host and reloads the daemon and server.
//
// If the config file cannot be read and was not explicitly specified via the
// "config-file" flag, a fresh default configuration is used instead. The new
// configuration is validated before the reload callback is invoked; see the
// TODO below for caveats of validating a partial configuration.
func Reload(configFile string, flags *pflag.FlagSet, reload func(*Config)) error {
	newConfig, err := getConflictFreeConfiguration(configFile, flags)
	if err != nil {
		if flags.Changed("config-file") || !os.IsNotExist(err) {
			return errors.Wrapf(err, "unable to configure the Docker daemon with file %s", configFile)
		}
		// Missing default config file is not an error; fall back to defaults.
		newConfig, err = New()
		if err != nil {
			return err
		}
	}

	// Check if duplicate label-keys with different values are found
	newLabels, err := GetConflictFreeLabels(newConfig.Labels)
	if err != nil {
		return err
	}
	newConfig.Labels = newLabels

	// TODO(thaJeztah) This logic is problematic and needs a rewrite;
	// This is validating newConfig before the "reload()" callback is executed.
	// At this point, newConfig may be a partial configuration, to be merged
	// with the existing configuration in the "reload()" callback. Validating
	// this config before it's merged can result in incorrect validation errors.
	//
	// However, the current "reload()" callback we use is DaemonCli.reloadConfig(),
	// which includes a call to Daemon.Reload(), which both performs "merging"
	// and validation, as well as actually updating the daemon configuration.
	// Calling DaemonCli.reloadConfig() *before* validation, could thus lead to
	// a failure in that function (making the reload non-atomic).
	//
	// While *some* errors could always occur when applying/updating the config,
	// we should make it more atomic, and;
	//
	// 1. get (a copy of) the active configuration
	// 2. get the new configuration
	// 3. apply the (reloadable) options from the new configuration
	// 4. validate the merged results
	// 5. apply the new configuration.
	if err := Validate(newConfig); err != nil {
		return errors.Wrap(err, "file configuration validation failed")
	}

	reload(newConfig)
	return nil
}
// boolValue is an interface that boolean value flags implement
// to tell the command line how to make -name equivalent to -name=true.
// It matches the unexported interface used by pflag's boolean flag values.
type boolValue interface {
	IsBoolFlag() bool
}
// MergeDaemonConfigurations reads a configuration file,
// loads the file configuration in an isolated structure,
// and merges the configuration provided from flags on top
// if there are no conflicts.
func MergeDaemonConfigurations(flagsConfig *Config, flags *pflag.FlagSet, configFile string) (*Config, error) {
	fileConfig, err := getConflictFreeConfiguration(configFile, flags)
	if err != nil {
		return nil, err
	}

	// Layer the flag-provided settings on top of the file configuration.
	if mergeErr := mergo.Merge(fileConfig, flagsConfig); mergeErr != nil {
		return nil, mergeErr
	}

	// The combined result must still pass validation as a whole.
	if validationErr := Validate(fileConfig); validationErr != nil {
		return nil, errors.Wrap(validationErr, "merged configuration validation from file and command line flags failed")
	}

	return fileConfig, nil
}
// getConflictFreeConfiguration loads the configuration from a JSON file.
// It compares that configuration with the one provided by the flags,
// and returns an error if there are conflicts.
//
// As side effects it records the set of explicitly-configured keys in
// config.ValuesSet, overrides boolean flag values so nullable file values
// (like `false`) are not clobbered by flag defaults, and runs the legacy
// option migrations from migratedNamedConfig on the decoded config.
func getConflictFreeConfiguration(configFile string, flags *pflag.FlagSet) (*Config, error) {
	b, err := os.ReadFile(configFile)
	if err != nil {
		return nil, err
	}

	// Decode the contents of the JSON file using a [byte order mark] if present, instead of assuming UTF-8 without BOM.
	// The BOM, if present, will be used to determine the encoding. If no BOM is present, we will assume the default
	// and preferred encoding for JSON as defined by [RFC 8259], UTF-8 without BOM.
	//
	// While JSON is normatively UTF-8 with no BOM, there are a couple of reasons to decode here:
	// * UTF-8 with BOM is something that new implementations should avoid producing; however, [RFC 8259 Section 8.1]
	//   allows implementations to ignore the UTF-8 BOM when present for interoperability. Older versions of Notepad,
	//   the only text editor available out of the box on Windows Server, writes UTF-8 with a BOM by default.
	// * The default encoding for [Windows PowerShell] is UTF-16 LE with BOM. While encodings in PowerShell can be a
	//   bit idiosyncratic, BOMs are still generally written. There is no support for selecting UTF-8 without a BOM as
	//   the encoding in Windows PowerShell, though some Cmdlets only write UTF-8 with no BOM. PowerShell Core
	//   introduces `utf8NoBOM` and makes it the default, but PowerShell Core is unlikely to be the implementation for
	//   a majority of Windows Server + PowerShell users.
	// * While [RFC 8259 Section 8.1] asserts that software that is not part of a closed ecosystem or that crosses a
	//   network boundary should only support UTF-8, and should never write a BOM, it does acknowledge older versions
	//   of the standard, such as [RFC 7159 Section 8.1]. In the interest of pragmatism and easing pain for Windows
	//   users, we consider Windows tools such as Windows PowerShell and Notepad part of our ecosystem, and support
	//   the two most common encodings: UTF-16 LE with BOM, and UTF-8 with BOM, in addition to the standard UTF-8
	//   without BOM.
	//
	// [byte order mark]: https://www.unicode.org/faq/utf_bom.html#BOM
	// [RFC 8259]: https://www.rfc-editor.org/rfc/rfc8259
	// [RFC 8259 Section 8.1]: https://www.rfc-editor.org/rfc/rfc8259#section-8.1
	// [RFC 7159 Section 8.1]: https://www.rfc-editor.org/rfc/rfc7159#section-8.1
	// [Windows PowerShell]: https://learn.microsoft.com/en-us/powershell/module/microsoft.powershell.core/about/about_character_encoding?view=powershell-5.1
	b, n, err := transform.Bytes(transform.Chain(unicode.BOMOverride(transform.Nop), encoding.UTF8Validator), b)
	if err != nil {
		return nil, errors.Wrapf(err, "failed to decode configuration JSON at offset %d", n)
	}
	// Trim whitespace so that an empty config can be detected for an early return.
	b = bytes.TrimSpace(b)

	var config Config
	if len(b) == 0 {
		return &config, nil // early return on empty config
	}

	if flags != nil {
		var jsonConfig map[string]interface{}
		if err := json.Unmarshal(b, &jsonConfig); err != nil {
			return nil, err
		}

		configSet := configValuesSet(jsonConfig)

		if err := findConfigurationConflicts(configSet, flags); err != nil {
			return nil, err
		}

		// Override flag values to make sure the values set in the config file with nullable values, like `false`,
		// are not overridden by default truthy values from the flags that were not explicitly set.
		// See https://github.com/docker/docker/issues/20289 for an example.
		//
		// TODO: Rewrite configuration logic to avoid same issue with other nullable values, like numbers.
		namedOptions := make(map[string]interface{})
		for key, value := range configSet {
			f := flags.Lookup(key)
			if f == nil { // ignore named flags that don't match
				namedOptions[key] = value
				continue
			}

			if _, ok := f.Value.(boolValue); ok {
				// Set errors are deliberately ignored here; the value was
				// produced by formatting a decoded JSON value.
				f.Value.Set(fmt.Sprintf("%v", value))
			}
		}
		if len(namedOptions) > 0 {
			// set also default for mergeVal flags that are boolValue at the same time.
			flags.VisitAll(func(f *pflag.Flag) {
				if opt, named := f.Value.(opts.NamedOption); named {
					v, set := namedOptions[opt.Name()]
					_, boolean := f.Value.(boolValue)
					if set && boolean {
						f.Value.Set(fmt.Sprintf("%v", v))
					}
				}
			})
		}

		config.ValuesSet = configSet
	}

	if err := json.Unmarshal(b, &config); err != nil {
		return nil, err
	}

	for _, mc := range migratedNamedConfig {
		mc.migrate(&config)
	}

	return &config, nil
}
// configValuesSet returns the configuration values explicitly set in the file.
func configValuesSet(config map[string]interface{}) map[string]interface{} {
	flatten := make(map[string]interface{})
	for key, value := range config {
		nested, isMap := value.(map[string]interface{})
		if !isMap || flatOptions[key] {
			// Scalars, arrays, and flat options are stored under their own key.
			flatten[key] = value
			continue
		}
		// Flatten nested objects one level so each inner key can be
		// compared against its matching command line flag.
		for nestedKey, nestedValue := range nested {
			flatten[nestedKey] = nestedValue
		}
	}
	return flatten
}
// findConfigurationConflicts iterates over the provided flags searching for
// duplicated configurations and unknown keys. It returns an error with all the conflicts if
// it finds any.
//
// It proceeds in four phases: unknown-key detection, NamedOption exclusion,
// flag/file duplicate detection, and migrated-option (old/new name) checks.
func findConfigurationConflicts(config map[string]interface{}, flags *pflag.FlagSet) error {
	// 1. Search keys from the file that we don't recognize as flags.
	unknownKeys := make(map[string]interface{})
	for key, value := range config {
		if flag := flags.Lookup(key); flag == nil && !skipValidateOptions[key] {
			unknownKeys[key] = value
		}
	}

	// 2. Discard values that implement NamedOption.
	// Their configuration name differs from their flag name, like `labels` and `label`.
	if len(unknownKeys) > 0 {
		unknownNamedConflicts := func(f *pflag.Flag) {
			if namedOption, ok := f.Value.(opts.NamedOption); ok {
				delete(unknownKeys, namedOption.Name())
			}
		}
		flags.VisitAll(unknownNamedConflicts)
	}

	if len(unknownKeys) > 0 {
		var unknown []string
		for key := range unknownKeys {
			unknown = append(unknown, key)
		}
		return errors.Errorf("the following directives don't match any configuration option: %s", strings.Join(unknown, ", "))
	}

	// 3. Search keys that are present as a flag and as a file option.
	printConflict := func(name string, flagValue, fileValue interface{}) string {
		switch name {
		case "http-proxy", "https-proxy":
			// Avoid leaking proxy credentials in the error message.
			flagValue = MaskCredentials(flagValue.(string))
			fileValue = MaskCredentials(fileValue.(string))
		}
		return fmt.Sprintf("%s: (from flag: %v, from file: %v)", name, flagValue, fileValue)
	}

	var conflicts []string
	flags.Visit(func(f *pflag.Flag) {
		// search option name in the json configuration payload if the value is a named option
		if namedOption, ok := f.Value.(opts.NamedOption); ok {
			if optsValue, ok := config[namedOption.Name()]; ok && !skipDuplicates[namedOption.Name()] {
				conflicts = append(conflicts, printConflict(namedOption.Name(), f.Value.String(), optsValue))
			}
		} else {
			// search flag name in the json configuration payload
			for _, name := range []string{f.Name, f.Shorthand} {
				if value, ok := config[name]; ok && !skipDuplicates[name] {
					conflicts = append(conflicts, printConflict(name, f.Value.String(), value))
					break
				}
			}
		}
	})

	// 4. Search for options that have been migrated to a NamedOption. These must not
	// be specified using both old and new config file names, or using the original
	// config file name and on the command line. (Or using the new config file name
	// and the command line, but those have already been found by the search above.)
	var errs []error
	for oldName, migration := range migratedNamedConfig {
		oldNameVal, haveOld := config[oldName]
		_, haveNew := config[migration.newName]
		if haveOld {
			if haveNew {
				errs = append(errs, fmt.Errorf("%s and %s must not both be specified in the config file", oldName, migration.newName))
			}
			if f := flags.Lookup(oldName); f != nil && f.Changed {
				conflicts = append(conflicts, printConflict(oldName, f.Value.String(), oldNameVal))
			}
		}
	}

	if len(conflicts) > 0 {
		errs = append(errs, errors.Errorf("the following directives are specified both as a flag and in the configuration file: %s", strings.Join(conflicts, ", ")))
	}
	return stderrors.Join(errs...)
}
// ValidateMinAPIVersion verifies if the given API version is within the
// range supported by the daemon. It is used to validate a custom minimum
// API version set through DOCKER_MIN_API_VERSION.
func ValidateMinAPIVersion(ver string) error {
	switch {
	case ver == "":
		return errors.New(`value is empty`)
	case strings.EqualFold(ver[0:1], "v"):
		return errors.New(`API version must be provided without "v" prefix`)
	case versions.LessThan(ver, defaultMinAPIVersion):
		return errors.Errorf(`minimum supported API version is %s: %s`, defaultMinAPIVersion, ver)
	case versions.GreaterThan(ver, DefaultAPIVersion):
		return errors.Errorf(`maximum supported API version is %s: %s`, DefaultAPIVersion, ver)
	}
	return nil
}
// Validate validates some specific configs.
// such as config.DNS, config.Labels, config.DNSSearch,
// as well as config.MaxConcurrentDownloads, config.MaxConcurrentUploads and config.MaxDownloadAttempts.
//
// Platform-specific settings are validated last via validatePlatformConfig.
func Validate(config *Config) error {
	// validate log-level
	if config.LogLevel != "" {
		// FIXME(thaJeztah): find a better way for this; this depends on knowledge of containerd's log package internals.
		// Alternatively: try log.SetLevel(config.LogLevel), and restore the original level, but this also requires internal knowledge.
		switch strings.ToLower(config.LogLevel) {
		case "panic", "fatal", "error", "warn", "info", "debug", "trace":
			// These are valid. See [log.SetLevel] for a list of accepted levels.
		default:
			return errors.Errorf("invalid logging level: %s", config.LogLevel)
		}
	}

	// validate log-format
	if logFormat := config.LogFormat; logFormat != "" {
		switch logFormat {
		case log.TextFormat, log.JSONFormat:
			// These are valid
		default:
			return errors.Errorf("invalid log format: %s", logFormat)
		}
	}

	// validate DNSSearch
	for _, dnsSearch := range config.DNSSearch {
		if _, err := opts.ValidateDNSSearch(dnsSearch); err != nil {
			return err
		}
	}

	// validate HostGatewayIPs
	if err := dopts.ValidateHostGatewayIPs(config.HostGatewayIPs); err != nil {
		return err
	}

	// validate Labels
	for _, label := range config.Labels {
		if _, err := opts.ValidateLabel(label); err != nil {
			return err
		}
	}

	// TODO(thaJeztah) Validations below should not accept "0" to be valid; see Validate() for a more in-depth description of this problem
	if config.MTU < 0 {
		return errors.Errorf("invalid default MTU: %d", config.MTU)
	}
	if config.MaxConcurrentDownloads < 0 {
		return errors.Errorf("invalid max concurrent downloads: %d", config.MaxConcurrentDownloads)
	}
	if config.MaxConcurrentUploads < 0 {
		return errors.Errorf("invalid max concurrent uploads: %d", config.MaxConcurrentUploads)
	}
	if config.MaxDownloadAttempts < 0 {
		return errors.Errorf("invalid max download attempts: %d", config.MaxDownloadAttempts)
	}

	if config.NetworkDiagnosticPort < 0 || config.NetworkDiagnosticPort > 65535 {
		return errors.Errorf("invalid network-diagnostic-port (%d): value must be between 0 and 65535", config.NetworkDiagnosticPort)
	}

	if _, err := ParseGenericResources(config.NodeGenericResources); err != nil {
		return err
	}

	for _, h := range config.Hosts {
		if _, err := opts.ValidateHost(h); err != nil {
			return err
		}
	}

	for _, mirror := range config.ServiceOptions.Mirrors {
		if _, err := registry.ValidateMirror(mirror); err != nil {
			return err
		}
	}

	if config.CorsHeaders != "" {
		// TODO(thaJeztah): option is used to produce error when used; remove in next release
		return errors.New(`DEPRECATED: The "api-cors-header" config parameter and the dockerd "--api-cors-header" option have been removed; use a reverse proxy if you need CORS headers`)
	}

	if _, err := parseExecOptions(config.ExecOptions); err != nil {
		return err
	}

	// validate platform-specific settings
	return validatePlatformConfig(config)
}
// parseExecOptions parses the given exec-options into a map. It returns an
// error if the exec-options are formatted incorrectly, or when options are
// used that are not supported on this platform.
//
// TODO(thaJeztah): consider making this more strict: make options case-sensitive and disallow whitespace around "=".
func parseExecOptions(execOptions []string) (map[string]string, error) {
	parsed := make(map[string]string)
	for _, raw := range execOptions {
		key, value, ok := strings.Cut(raw, "=")
		// Keys are lower-cased and whitespace around both sides is ignored.
		key = strings.ToLower(strings.TrimSpace(key))
		value = strings.TrimSpace(value)
		if !ok || key == "" || value == "" {
			return nil, fmt.Errorf("invalid exec-opt (%s): must be formatted 'opt=value'", raw)
		}
		if err := validatePlatformExecOpt(key, value); err != nil {
			return nil, fmt.Errorf("invalid exec-opt (%s): %w", raw, err)
		}
		parsed[key] = value
	}
	return parsed, nil
}
// MaskCredentials masks credentials that are in an URL.
func MaskCredentials(rawURL string) string {
	u, err := url.Parse(rawURL)
	if err != nil {
		// Not a parsable URL; return the input unchanged.
		return rawURL
	}
	if u.User == nil {
		// No userinfo component, so nothing to mask.
		return rawURL
	}
	u.User = url.UserPassword("xxxxx", "xxxxx")
	return u.String()
}
// migrateHostGatewayIP moves the deprecated single host-gateway-ip value
// into the HostGatewayIPs slice and clears the legacy field.
func migrateHostGatewayIP(config *Config) {
	legacy := config.HostGatewayIP //nolint:staticcheck // ignore SA1019: migrating to HostGatewayIPs.
	if legacy == nil {
		return
	}
	addr, _ := netip.AddrFromSlice(legacy)
	config.HostGatewayIPs = []netip.Addr{addr}
	config.HostGatewayIP = nil //nolint:staticcheck // ignore SA1019: clearing old value.
}
// Sanitize sanitizes the config for printing. It is currently limited to
// masking usernames and passwords from Proxy URLs.
func Sanitize(cfg Config) Config {
	masked := Proxies{
		HTTPProxy:  MaskCredentials(cfg.HTTPProxy),
		HTTPSProxy: MaskCredentials(cfg.HTTPSProxy),
		NoProxy:    MaskCredentials(cfg.NoProxy),
	}
	cfg.CommonConfig.Proxies = masked
	return cfg
}
package config
import (
"context"
"fmt"
"net"
"os/exec"
"path/filepath"
"strings"
"github.com/containerd/cgroups/v3"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge"
"github.com/docker/docker/daemon/pkg/opts"
"github.com/docker/docker/pkg/homedir"
"github.com/docker/docker/pkg/rootless"
"github.com/moby/moby/api/types/container"
"github.com/moby/moby/api/types/system"
"github.com/pkg/errors"
)
const (
	// DefaultIpcMode is default for container's IpcMode, if not set otherwise
	DefaultIpcMode = container.IPCModePrivate

	// DefaultCgroupNamespaceMode is the default mode for containers cgroup namespace when using cgroups v2.
	DefaultCgroupNamespaceMode = container.CgroupnsModePrivate

	// DefaultCgroupV1NamespaceMode is the default mode for containers cgroup namespace when using cgroups v1.
	DefaultCgroupV1NamespaceMode = container.CgroupnsModeHost

	// StockRuntimeName is the reserved name/alias used to represent the
	// OCI runtime being shipped with the docker daemon package.
	StockRuntimeName = "runc"

	// userlandProxyBinary is the name of the userland-proxy binary;
	// it is resolved to an absolute path by setPlatformDefaults via
	// lookupBinPath.
	userlandProxyBinary = "docker-proxy"
)
// BridgeConfig stores all the parameters for both the bridge driver and the default bridge network.
type BridgeConfig struct {
	DefaultBridgeConfig

	// Fields below configure the bridge driver itself; the embedded
	// DefaultBridgeConfig holds the default-network parameters. JSON tags
	// match the corresponding daemon.json keys / CLI flags.
	EnableIPTables bool `json:"iptables,omitempty"`
	EnableIP6Tables bool `json:"ip6tables,omitempty"`
	EnableIPForward bool `json:"ip-forward,omitempty"`
	DisableFilterForwardDrop bool `json:"ip-forward-no-drop,omitempty"`
	EnableIPMasq bool `json:"ip-masq,omitempty"`
	EnableUserlandProxy bool `json:"userland-proxy,omitempty"`
	// UserlandProxyPath is the absolute path of the userland-proxy binary;
	// defaulted by setPlatformDefaults and validated (when the proxy is
	// enabled) by verifyUserlandProxyConfig.
	UserlandProxyPath string `json:"userland-proxy-path,omitempty"`
	AllowDirectRouting bool `json:"allow-direct-routing,omitempty"`
}
// DefaultBridgeConfig stores all the parameters for the default bridge network.
type DefaultBridgeConfig struct {
	commonBridgeConfig

	// Fields below here are platform specific.
	EnableIPv6 bool `json:"ipv6,omitempty"`
	// FixedCIDRv6 is validated by bridge.ValidateFixedCIDRV6 (see
	// validatePlatformConfig).
	FixedCIDRv6 string `json:"fixed-cidr-v6,omitempty"`
	MTU int `json:"mtu,omitempty"`
	DefaultIP net.IP `json:"ip,omitempty"`
	// IP corresponds to the "bip" daemon.json key; IP6 to "bip6".
	IP string `json:"bip,omitempty"`
	IP6 string `json:"bip6,omitempty"`
	DefaultGatewayIPv4 net.IP `json:"default-gateway,omitempty"`
	DefaultGatewayIPv6 net.IP `json:"default-gateway-v6,omitempty"`
	InterContainerCommunication bool `json:"icc,omitempty"`
}
// Config defines the configuration of a docker daemon.
// It includes json tags to deserialize configuration from a file
// using the same names that the flags in the command line uses.
type Config struct {
	CommonConfig

	// Fields below here are platform specific.
	Runtimes map[string]system.Runtime `json:"runtimes,omitempty"`
	DefaultInitBinary string `json:"default-init,omitempty"`
	CgroupParent string `json:"cgroup-parent,omitempty"`
	EnableSelinuxSupport bool `json:"selinux-enabled,omitempty"`
	RemappedRoot string `json:"userns-remap,omitempty"`
	Ulimits map[string]*container.Ulimit `json:"default-ulimits,omitempty"`
	CPURealtimePeriod int64 `json:"cpu-rt-period,omitempty"`
	CPURealtimeRuntime int64 `json:"cpu-rt-runtime,omitempty"`
	Init bool `json:"init,omitempty"`
	// InitPath, when set, takes precedence over DefaultInitBinary
	// (see GetInitPath).
	InitPath string `json:"init-path,omitempty"`
	SeccompProfile string `json:"seccomp-profile,omitempty"`
	ShmSize opts.MemBytes `json:"default-shm-size,omitempty"`
	NoNewPrivileges bool `json:"no-new-privileges,omitempty"`
	// IpcMode is validated by verifyDefaultIpcMode; only "private" and
	// "shareable" are accepted as defaults.
	IpcMode string `json:"default-ipc-mode,omitempty"`
	CgroupNamespaceMode string `json:"default-cgroupns-mode,omitempty"`
	// ResolvConf is the path to the configuration of the host resolver
	ResolvConf string `json:"resolv-conf,omitempty"`
	// Rootless is set automatically when running under RootlessKit
	// (see setPlatformDefaults).
	Rootless bool `json:"rootless,omitempty"`
}
// GetExecRoot returns the path configured as the daemon's exec-root.
func (conf *Config) GetExecRoot() string {
	return conf.ExecRoot
}
// GetInitPath returns the docker-init binary to use: an explicitly
// configured InitPath wins, then DefaultInitBinary from the configuration,
// and finally the compiled-in default.
func (conf *Config) GetInitPath() string {
	switch {
	case conf.InitPath != "":
		return conf.InitPath
	case conf.DefaultInitBinary != "":
		return conf.DefaultInitBinary
	default:
		return DefaultInitBinary
	}
}
// LookupInitPath returns an absolute path to the "docker-init" binary by
// searching relevant "libexec" directories (per FHS 3.0 & 2.3) followed by
// PATH; see lookupBinPath for the search order.
func (conf *Config) LookupInitPath() (string, error) {
	return lookupBinPath(conf.GetInitPath())
}
// GetResolvConf returns the configured path to the host's resolv.conf.
// Check setupResolvConf on how this is selected.
func (conf *Config) GetResolvConf() string {
	return conf.ResolvConf
}
// IsSwarmCompatible defines if swarm mode can be enabled in this config
func (conf *Config) IsSwarmCompatible() error {
	if conf.LiveRestoreEnabled {
		return errors.New("--live-restore daemon configuration is incompatible with swarm mode")
	}
	// Swarm has not yet been updated to use nftables. But, if "iptables" is
	// disabled, it doesn't add rules anyway.
	if conf.EnableIPTables && conf.FirewallBackend == "nftables" {
		return errors.New("--firewall-backend=nftables is incompatible with swarm mode")
	}
	return nil
}
// ValidatePlatformConfig checks if any platform-specific configuration settings are invalid.
//
// Deprecated: this function was only used internally and is no longer used. Use [Validate] instead.
func (conf *Config) ValidatePlatformConfig() error {
	// Kept only for backward compatibility; delegates to the internal validator.
	return validatePlatformConfig(conf)
}
// IsRootless reports whether the daemon is configured for rootless mode.
// On Linux this reflects conf.Rootless; the Windows build always reports false.
func (conf *Config) IsRootless() bool {
	return conf.Rootless
}
// setPlatformDefaults fills in the Linux-specific defaults for cfg: ulimits,
// shm size, seccomp profile, IPC and cgroup-namespace modes, the
// userland-proxy path, and the root/exec-root/pidfile locations (which
// differ when running under RootlessKit).
func setPlatformDefaults(cfg *Config) error {
	cfg.Ulimits = make(map[string]*container.Ulimit)
	cfg.ShmSize = opts.MemBytes(DefaultShmSize)
	cfg.SeccompProfile = SeccompProfileDefault
	cfg.IpcMode = string(DefaultIpcMode)
	cfg.Runtimes = make(map[string]system.Runtime)

	if cgroups.Mode() == cgroups.Unified {
		cfg.CgroupNamespaceMode = string(DefaultCgroupNamespaceMode)
	} else {
		cfg.CgroupNamespaceMode = string(DefaultCgroupV1NamespaceMode)
	}

	proxyPath, err := lookupBinPath(userlandProxyBinary)
	if err != nil {
		// Log, but don't error here. This allows running a daemon with
		// userland-proxy disabled (which does not require the binary
		// to be present).
		//
		// An error is still produced by [Config.ValidatePlatformConfig] if
		// userland-proxy is enabled in the configuration.
		//
		// We log this at "debug" level, as this code is also executed
		// when running "--version", and we don't want to print logs in
		// that case..
		log.G(context.TODO()).WithError(err).Debug("failed to lookup default userland-proxy binary")
	}
	cfg.BridgeConfig.UserlandProxyPath = proxyPath

	if !rootless.RunningWithRootlessKit() {
		cfg.Root = "/var/lib/docker"
		cfg.ExecRoot = "/var/run/docker"
		cfg.Pidfile = "/var/run/docker.pid"
		return nil
	}

	cfg.Rootless = true
	dataHome, err := homedir.GetDataHome()
	if err != nil {
		return err
	}
	runtimeDir, err := homedir.GetRuntimeDir()
	if err != nil {
		return err
	}
	cfg.Root = filepath.Join(dataHome, "docker")
	cfg.ExecRoot = filepath.Join(runtimeDir, "docker")
	cfg.Pidfile = filepath.Join(runtimeDir, "docker.pid")
	return nil
}
// lookupBinPath returns an absolute path to the provided binary by searching
// relevant "libexec" locations (per FHS 3.0 & 2.3) followed by PATH.
// A binary that is already an absolute path is returned verbatim.
func lookupBinPath(binary string) (string, error) {
	if filepath.IsAbs(binary) {
		return binary, nil
	}

	dirs := []string{
		// FHS 3.0: "/usr/libexec includes internal binaries that are not intended to be executed directly by users or shell scripts. Applications may use a single subdirectory under /usr/libexec."
		// https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch04s07.html
		"/usr/local/libexec/docker",
		"/usr/libexec/docker",
		// FHS 2.3: "/usr/lib includes object files, libraries, and internal binaries that are not intended to be executed directly by users or shell scripts."
		// https://refspecs.linuxfoundation.org/FHS_2.3/fhs-2.3.html#USRLIBLIBRARIESFORPROGRAMMINGANDPA
		"/usr/local/lib/docker",
		"/usr/lib/docker",
	}
	// According to FHS 3.0, it is not necessary to have a subdir here (see
	// note and reference above). If the binary has a `docker-` prefix, also
	// look it up without the "docker" subdirectory.
	if strings.HasPrefix(binary, "docker-") {
		dirs = append(dirs, "/usr/local/libexec", "/usr/libexec")
	}

	for _, dir := range dirs {
		// exec.LookPath has a fast-path short-circuit for paths that contain
		// "/" (skipping the PATH lookup) that then verifies whether the given
		// path is likely to be an actual executable binary (so we invoke that
		// instead of reimplementing the same checks).
		if found, err := exec.LookPath(filepath.Join(dir, binary)); err == nil {
			return found, nil
		}
	}
	// No match in any of the "libexec" directories; fall back to PATH.
	return exec.LookPath(binary)
}
// validatePlatformConfig checks if any platform-specific configuration settings are invalid.
// Checks run in order and the first failure is returned.
func validatePlatformConfig(conf *Config) error {
	if err := verifyUserlandProxyConfig(conf); err != nil {
		return err
	}
	if err := verifyDefaultIpcMode(conf.IpcMode); err != nil {
		return err
	}
	if err := bridge.ValidateFixedCIDRV6(conf.FixedCIDRv6); err != nil {
		return errors.Wrap(err, "invalid fixed-cidr-v6")
	}
	if err := validateFirewallBackend(conf.FirewallBackend); err != nil {
		return errors.Wrap(err, "invalid firewall-backend")
	}
	return verifyDefaultCgroupNsMode(conf.CgroupNamespaceMode)
}
// validatePlatformExecOpt validates if the given exec-opt and value are valid
// for the current platform (Linux). The value is currently unchecked for the
// accepted options.
func validatePlatformExecOpt(opt, value string) error {
	switch opt {
	case "native.cgroupdriver":
		// TODO(thaJeztah): add validation that's currently in daemon.verifyCgroupDriver
		return nil
	case "isolation":
		return fmt.Errorf("option '%s' is only supported on windows", opt)
	default:
		return fmt.Errorf("unknown option: '%s'", opt)
	}
}
// verifyUserlandProxyConfig verifies if a valid userland-proxy path
// is configured if userland-proxy is enabled.
func verifyUserlandProxyConfig(conf *Config) error {
	if !conf.EnableUserlandProxy {
		// Proxy disabled; no binary is required.
		return nil
	}
	switch p := conf.UserlandProxyPath; {
	case p == "":
		return errors.New("invalid userland-proxy-path: userland-proxy is enabled, but userland-proxy-path is not set")
	case !filepath.IsAbs(p):
		return errors.New("invalid userland-proxy-path: must be an absolute path: " + p)
	}
	// Using exec.LookPath here, because it also produces an error if the
	// given path is not a valid executable or a directory.
	if _, err := exec.LookPath(conf.UserlandProxyPath); err != nil {
		return errors.Wrap(err, "invalid userland-proxy-path")
	}
	return nil
}
// verifyDefaultIpcMode checks that mode is acceptable as the daemon-wide
// default IPC mode; only "private" and "shareable" (or empty) are allowed.
func verifyDefaultIpcMode(mode string) error {
	const hint = `use "shareable" or "private"`

	ipcMode := container.IpcMode(mode)
	if !ipcMode.Valid() {
		return fmt.Errorf("default IPC mode setting (%v) is invalid; "+hint, ipcMode)
	}
	if ipcMode != "" && !ipcMode.IsPrivate() && !ipcMode.IsShareable() {
		return fmt.Errorf(`IPC mode "%v" is not supported as default value; `+hint, ipcMode)
	}
	return nil
}
// validateFirewallBackend checks that val names a supported firewall backend.
// An empty value is accepted (the daemon picks its default backend).
func validateFirewallBackend(val string) error {
	if val == "" || val == "iptables" || val == "nftables" {
		return nil
	}
	return errors.New(`allowed values are "iptables" and "nftables"`)
}
// verifyDefaultCgroupNsMode checks that mode is a valid default cgroup
// namespace mode, as defined by container.CgroupnsMode.Valid.
func verifyDefaultCgroupNsMode(mode string) error {
	if nsMode := container.CgroupnsMode(mode); !nsMode.Valid() {
		return fmt.Errorf(`invalid default cgroup namespace (%v): use "host" or "private"`, nsMode)
	}
	return nil
}
package config
import (
"github.com/docker/docker/daemon/cluster/convert"
"github.com/moby/moby/api/types/swarm"
"github.com/moby/swarmkit/v2/api/genericresource"
)
// ParseGenericResources parses and validates the specified string as a list of GenericResource
func ParseGenericResources(value []string) ([]swarm.GenericResource, error) {
	if len(value) == 0 {
		// Nothing to parse.
		return nil, nil
	}
	parsed, err := genericresource.Parse(value)
	if err != nil {
		return nil, err
	}
	return convert.GenericResourcesFromGRPC(parsed), nil
}
package container
import (
"context"
"sync"
)
// attachContext is the context used for attach calls.
type attachContext struct {
	mu         sync.Mutex
	ctx        context.Context
	cancelFunc context.CancelFunc
}

// init returns the context for attach calls, lazily creating it (together
// with its cancel function) if none is currently active.
func (ac *attachContext) init() context.Context {
	ac.mu.Lock()
	defer ac.mu.Unlock()
	if ac.ctx == nil {
		ac.ctx, ac.cancelFunc = context.WithCancel(context.Background())
	}
	return ac.ctx
}

// cancel cancels the active context, if any. All attach calls should detach
// after this call; a subsequent init creates a fresh context.
func (ac *attachContext) cancel() {
	ac.mu.Lock()
	defer ac.mu.Unlock()
	if ac.ctx != nil {
		ac.cancelFunc()
		ac.ctx = nil
	}
}
package container
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"runtime"
"strings"
"syscall"
"time"
"github.com/containerd/containerd/v2/pkg/cio"
cerrdefs "github.com/containerd/errdefs"
"github.com/containerd/log"
"github.com/containerd/platforms"
libcontainerdtypes "github.com/docker/docker/daemon/internal/libcontainerd/types"
"github.com/docker/docker/daemon/internal/restartmanager"
"github.com/docker/docker/daemon/internal/stream"
"github.com/docker/docker/daemon/logger"
"github.com/docker/docker/daemon/logger/jsonfilelog"
"github.com/docker/docker/daemon/logger/local"
"github.com/docker/docker/daemon/logger/loggerutils/cache"
"github.com/docker/docker/daemon/network"
"github.com/docker/docker/daemon/volume"
volumemounts "github.com/docker/docker/daemon/volume/mounts"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/image"
"github.com/docker/docker/oci"
"github.com/docker/go-units"
containertypes "github.com/moby/moby/api/types/container"
"github.com/moby/moby/api/types/events"
mounttypes "github.com/moby/moby/api/types/mount"
swarmtypes "github.com/moby/moby/api/types/swarm"
agentexec "github.com/moby/swarmkit/v2/agent/exec"
"github.com/moby/sys/atomicwriter"
"github.com/moby/sys/signal"
"github.com/moby/sys/symlink"
"github.com/moby/sys/user"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
)
const (
	// configFileName is the on-disk name of the container's config file,
	// stored under the container root (see ConfigPath).
	configFileName = "config.v2.json"
	// hostConfigFileName is the on-disk name of the container's host config,
	// stored under the container root (see HostConfigPath).
	hostConfigFileName = "hostconfig.json"
	// defaultStopSignal is the default syscall signal used to stop a container.
	defaultStopSignal = syscall.SIGTERM
)
// ExitStatus provides exit reasons for a container.
type ExitStatus struct {
	// The exit code with which the container exited.
	ExitCode int

	// Time at which the container died.
	ExitedAt time.Time
}
// Container holds the structure defining a container object.
type Container struct {
	StreamConfig *stream.Config
	// We embed [State] here so that Container supports states directly,
	// but marshal it as a struct in JSON.
	//
	// State also provides a [sync.Mutex] which is used as lock for both
	// the Container and State.
	*State `json:"State"`
	Root string `json:"-"` // Path to the "home" of the container, including metadata.
	BaseFS string `json:"-"` // Path to the graphdriver mountpoint
	RWLayer RWLayer `json:"-"`
	ID string
	Created time.Time
	Managed bool
	// Path and Args are the container's entrypoint and its arguments
	// (passed to the log driver as ContainerEntrypoint/ContainerArgs).
	Path string
	Args []string
	Config *containertypes.Config
	ImageID image.ID `json:"Image"`
	ImageManifest *ocispec.Descriptor
	NetworkSettings *network.Settings
	// LogPath is set by StartLogger for the json-file driver; it is
	// deliberately left empty for the "local" driver.
	LogPath string
	Name string
	Driver string
	// Deprecated: use [ImagePlatform.OS] instead.
	// TODO: Remove, see https://github.com/moby/moby/issues/48892
	OS string
	ImagePlatform ocispec.Platform
	RestartCount int
	HasBeenStartedBefore bool
	HasBeenManuallyStopped bool // used for unless-stopped restart policy
	HasBeenManuallyRestarted bool `json:"-"` // used to distinguish restart caused by restart policy from the manual one
	MountPoints map[string]*volumemounts.MountPoint
	HostConfig *containertypes.HostConfig `json:"-"` // do not serialize the host config in the json, otherwise we'll make the container unportable
	ExecCommands *ExecStore `json:"-"`
	DependencyStore agentexec.DependencyGetter `json:"-"`
	SecretReferences []*swarmtypes.SecretReference
	ConfigReferences []*swarmtypes.ConfigReference
	// logDriver for closing
	LogDriver logger.Logger `json:"-"`
	LogCopier *logger.Copier `json:"-"`
	// restartManager is created lazily by RestartManager().
	restartManager *restartmanager.RestartManager
	attachContext *attachContext

	// Fields here are specific to Unix platforms
	SecurityOptions
	HostnamePath string
	HostsPath string
	ShmPath string
	ResolvConfPath string

	// Fields here are specific to Windows
	NetworkSharedContainerID string `json:"-"`
	SharedEndpointList []string `json:"-"`

	LocalLogCacheMeta localLogCacheMeta `json:",omitempty"`
}
// SecurityOptions groups the security-related settings of a container
// (labels, profiles, and privilege flags).
type SecurityOptions struct {
	// MountLabel contains the options for the "mount" command.
	MountLabel string
	// ProcessLabel is the label applied to the container's processes;
	// GetProcessLabel returns "" instead for privileged containers.
	ProcessLabel string
	AppArmorProfile string
	SeccompProfile string
	NoNewPrivileges bool
	WritableCgroups *bool
}
// localLogCacheMeta records whether the "local file cache enabled" notice has
// already been logged for this container, so StartLogger logs it only once.
type localLogCacheMeta struct {
	HaveNotifyEnabled bool
}
// NewBaseContainer creates a new container with its
// basic configuration.
func NewBaseContainer(id, root string) *Container {
	c := &Container{
		ID:            id,
		Root:          root,
		State:         NewState(),
		ExecCommands:  NewExecStore(),
		MountPoints:   make(map[string]*volumemounts.MountPoint),
		StreamConfig:  stream.NewConfig(),
		attachContext: &attachContext{},
	}
	return c
}
// FromDisk loads the container configuration stored in the host.
func (container *Container) FromDisk() error {
	pth, err := container.ConfigPath()
	if err != nil {
		return err
	}
	jsonSource, err := os.Open(pth)
	if err != nil {
		return err
	}
	defer jsonSource.Close()
	dec := json.NewDecoder(jsonSource)
	// Load container settings
	if err := dec.Decode(container); err != nil {
		return err
	}
	if container.OS != "" {
		// OS was deprecated in favor of ImagePlatform
		// Make sure we migrate the OS to ImagePlatform.OS.
		if container.ImagePlatform.OS == "" {
			container.ImagePlatform.OS = container.OS //nolint:staticcheck // ignore SA1019: field is deprecated
		}
	} else {
		// Pre multiple-OS support containers have no OS set.
		// Assume it is the host platform.
		container.ImagePlatform = platforms.DefaultSpec()
		container.OS = container.ImagePlatform.OS //nolint:staticcheck // ignore SA1019: field is deprecated
	}
	// The host config lives in a separate file (hostconfig.json) and is
	// loaded separately.
	return container.readHostConfig()
}
// toDisk writes the container's configuration (config.v2.json, hostconfig.json)
// to disk and returns a deep copy.
func (container *Container) toDisk() (*Container, error) {
	pth, err := container.ConfigPath()
	if err != nil {
		return nil, err
	}
	// Save container settings
	f, err := atomicwriter.New(pth, 0o600)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	// Encode once, teeing the JSON into both the file and an in-memory
	// buffer; the buffer is decoded again to produce the deep copy.
	var buf bytes.Buffer
	w := io.MultiWriter(&buf, f)
	if err := json.NewEncoder(w).Encode(container); err != nil {
		return nil, err
	}
	var deepCopy Container
	if err := json.NewDecoder(&buf).Decode(&deepCopy); err != nil {
		return nil, err
	}
	// HostConfig is excluded from the container JSON (json:"-"), so persist
	// it separately and attach the deep copy that WriteHostConfig returns.
	deepCopy.HostConfig, err = container.WriteHostConfig()
	if err != nil {
		return nil, err
	}
	return &deepCopy, nil
}
// CheckpointTo makes the Container's current state visible to queries, and persists state.
// Callers must hold a Container lock.
func (container *Container) CheckpointTo(ctx context.Context, store *ViewDB) error {
	ctx, span := otel.Tracer("").Start(ctx, "container.CheckpointTo", trace.WithAttributes(
		attribute.String("container.ID", container.ID),
		attribute.String("container.Name", container.Name)))
	defer span.End()

	// Persist to disk first; the deep copy returned by toDisk is what gets
	// stored in the view database.
	snapshot, err := container.toDisk()
	if err != nil {
		return err
	}
	return store.Save(snapshot)
}
// readHostConfig reads the host configuration from disk for the container.
func (container *Container) readHostConfig() error {
	container.HostConfig = &containertypes.HostConfig{}
	// If the hostconfig file does not exist, do not read it.
	// (We still have to initialize container.HostConfig,
	// but that's OK, since we just did that above.)
	pth, err := container.HostConfigPath()
	if err != nil {
		return err
	}
	f, err := os.Open(pth)
	if err != nil {
		if os.IsNotExist(err) {
			// A missing hostconfig.json is not an error; keep the empty
			// HostConfig initialized above.
			return nil
		}
		return err
	}
	defer f.Close()
	if err := json.NewDecoder(f).Decode(&container.HostConfig); err != nil {
		return err
	}
	// Older configs may carry nil DNS fields; normalize them to empty slices.
	container.InitDNSHostConfig()
	return nil
}
// WriteHostConfig saves the host configuration on disk for the container,
// and returns a deep copy of the saved object. Callers must hold a Container lock.
func (container *Container) WriteHostConfig() (*containertypes.HostConfig, error) {
	pth, err := container.HostConfigPath()
	if err != nil {
		return nil, err
	}
	f, err := atomicwriter.New(pth, 0o600)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// Tee the encoded JSON into a buffer so it can be decoded again to
	// produce the deep copy returned to the caller.
	var buf bytes.Buffer
	if err := json.NewEncoder(io.MultiWriter(&buf, f)).Encode(&container.HostConfig); err != nil {
		return nil, err
	}
	var deepCopy containertypes.HostConfig
	if err := json.NewDecoder(&buf).Decode(&deepCopy); err != nil {
		return nil, err
	}
	return &deepCopy, nil
}
// CommitInMemory makes the Container's current state visible to queries,
// but does not persist state.
//
// Callers must hold a Container lock.
func (container *Container) CommitInMemory(store *ViewDB) error {
	// Deep-copy the container via a JSON round-trip, mirroring toDisk but
	// without touching the filesystem.
	var buf bytes.Buffer
	if err := json.NewEncoder(&buf).Encode(container); err != nil {
		return err
	}
	var deepCopy Container
	if err := json.NewDecoder(&buf).Decode(&deepCopy); err != nil {
		return err
	}
	// HostConfig is excluded from the container JSON (json:"-"), so it is
	// round-tripped separately.
	buf.Reset()
	if err := json.NewEncoder(&buf).Encode(container.HostConfig); err != nil {
		return err
	}
	if err := json.NewDecoder(&buf).Decode(&deepCopy.HostConfig); err != nil {
		return err
	}
	return store.Save(&deepCopy)
}
// SetupWorkingDirectory sets up the container's working directory as set in
// container.Config.WorkingDir, creating it (owned by uid:gid) inside the
// container's filesystem when it does not yet exist.
func (container *Container) SetupWorkingDirectory(uid int, gid int) error {
	if container.Config.WorkingDir == "" {
		// Nothing configured; nothing to create.
		return nil
	}
	workdir := filepath.Clean(container.Config.WorkingDir)
	// Resolve the path within the container's BaseFS so symlinks cannot
	// escape the container's filesystem.
	pth, err := container.GetResourcePath(workdir)
	if err != nil {
		return err
	}
	if err := user.MkdirAllAndChown(pth, 0o755, uid, gid, user.WithOnlyNew); err != nil {
		// Produce a friendlier error when the path exists but is not a directory.
		pthInfo, err2 := os.Stat(pth)
		if err2 == nil && pthInfo != nil && !pthInfo.IsDir() {
			return errors.Errorf("Cannot mkdir: %s is not a directory", container.Config.WorkingDir)
		}
		return err
	}
	return nil
}
// GetResourcePath evaluates `path` in the scope of the container's BaseFS, with proper path
// sanitization. Symlinks are all scoped to the BaseFS of the container, as
// though the container's BaseFS was `/`.
//
// The BaseFS of a container is the host-facing path which is bind-mounted as
// `/` inside the container. This method is essentially used to access a
// particular path inside the container as though you were a process in that
// container.
//
// # NOTE
// The returned path is *only* safely scoped inside the container's BaseFS
// if no component of the returned path changes (such as a component
// symlinking to a different path) between using this method and using the
// path. See symlink.FollowSymlinkInScope for more details.
func (container *Container) GetResourcePath(path string) (string, error) {
	if container.BaseFS == "" {
		return "", errors.New("GetResourcePath: BaseFS of container " + container.ID + " is unexpectedly empty")
	}
	// IMPORTANT - These are paths on the OS where the daemon is running, hence
	// any filepath operations must be done in an OS-agnostic way.
	r, e := symlink.FollowSymlinkInScope(filepath.Join(container.BaseFS, cleanScopedPath(path)), container.BaseFS)
	// Log this here on the daemon side as there's otherwise no indication apart
	// from the error being propagated all the way back to the client. This makes
	// debugging significantly easier and clearly indicates the error comes from the daemon.
	if e != nil {
		log.G(context.TODO()).Errorf("Failed to ResolveScopedPath BaseFS %s path %s %s\n", container.BaseFS, path, e)
	}
	return r, e
}
// cleanScopedPath prepares the given path to be combined with a mount path or
// a drive-letter. On Windows, it removes any existing driveletter (e.g. "C:").
// The returned path is always prefixed with a [filepath.Separator].
func cleanScopedPath(path string) string {
	if len(path) >= 2 {
		// On non-Windows platforms VolumeName is always "", so this is a no-op.
		path = strings.TrimPrefix(path, filepath.VolumeName(path))
	}
	return filepath.Join(string(filepath.Separator), path)
}
// GetRootResourcePath evaluates `path` in the scope of the container's root, with proper path
// sanitization. Symlinks are all scoped to the root of the container, as
// though the container's root was `/`.
//
// The root of a container is the host-facing configuration metadata directory.
// Only use this method to safely access the container's `container.json` or
// other metadata files. If in doubt, use container.GetResourcePath.
//
// # NOTE
// The returned path is *only* safely scoped inside the container's root
// if no component of the returned path changes (such as a component
// symlinking to a different path) between using this method and using the
// path. See symlink.FollowSymlinkInScope for more details.
func (container *Container) GetRootResourcePath(path string) (string, error) {
	// IMPORTANT - These are paths on the OS where the daemon is running, hence
	// any filepath operations must be done in an OS agnostic way.
	scoped := filepath.Join(string(os.PathSeparator), path)
	return symlink.FollowSymlinkInScope(filepath.Join(container.Root, scoped), container.Root)
}
// ExitOnNext signals to the monitor that it should not restart the container
// after we send the kill signal.
func (container *Container) ExitOnNext() {
	container.RestartManager().Cancel()
}
// HostConfigPath returns the path of the container's hostconfig.json,
// resolved safely within the container's root directory.
func (container *Container) HostConfigPath() (string, error) {
	return container.GetRootResourcePath(hostConfigFileName)
}
// ConfigPath returns the path of the container's config.v2.json,
// resolved safely within the container's root directory.
func (container *Container) ConfigPath() (string, error) {
	return container.GetRootResourcePath(configFileName)
}
// CheckpointDir returns the directory in which this container's checkpoints
// are stored, located directly under the container root.
func (container *Container) CheckpointDir() string {
	return filepath.Join(container.Root, "checkpoints")
}
// StartLogger starts a new logger driver for the container.
// It resolves the driver from the container's log configuration, applies
// driver-specific file locations, optionally wraps the driver in a
// non-blocking ring buffer, and adds a local read cache for drivers that
// cannot be read back.
func (container *Container) StartLogger() (logger.Logger, error) {
	cfg := container.HostConfig.LogConfig
	initDriver, err := logger.GetLogDriver(cfg.Type)
	if err != nil {
		return nil, errors.Wrap(err, "failed to get logging factory")
	}
	info := logger.Info{
		Config: cfg.Config,
		ContainerID: container.ID,
		ContainerName: container.Name,
		ContainerEntrypoint: container.Path,
		ContainerArgs: container.Args,
		ContainerImageID: container.ImageID.String(),
		ContainerImageName: container.Config.Image,
		ContainerCreated: container.Created,
		ContainerEnv: container.Config.Env,
		ContainerLabels: container.Config.Labels,
		DaemonName: "docker",
	}
	// Set logging file for "json-logger"
	// TODO(@cpuguy83): Setup here based on log driver is a little weird.
	switch cfg.Type {
	case jsonfilelog.Name:
		info.LogPath, err = container.GetRootResourcePath(fmt.Sprintf("%s-json.log", container.ID))
		if err != nil {
			return nil, err
		}
		container.LogPath = info.LogPath
	case local.Name:
		// Do not set container.LogPath for the local driver
		// This would expose the value to the API, which should not be done as it means
		// that the log file implementation would become a stable API that cannot change.
		logDir, err := container.GetRootResourcePath("local-logs")
		if err != nil {
			return nil, err
		}
		if err := os.MkdirAll(logDir, 0o700); err != nil {
			return nil, errdefs.System(errors.Wrap(err, "error creating local logs dir"))
		}
		info.LogPath = filepath.Join(logDir, "container.log")
	}
	l, err := initDriver(info)
	if err != nil {
		return nil, err
	}
	// In non-blocking mode, wrap the driver in a ring buffer so that slow
	// drivers do not block the container's output.
	if containertypes.LogMode(cfg.Config["mode"]) == containertypes.LogModeNonBlock {
		bufferSize := int64(-1)
		if s, exists := cfg.Config["max-buffer-size"]; exists {
			bufferSize, err = units.RAMInBytes(s)
			if err != nil {
				return nil, err
			}
		}
		l = logger.NewRingLogger(l, info, bufferSize)
	}
	// Drivers that cannot be read back get a local file cache so that
	// "docker logs" keeps working; notify the user once per container.
	if _, ok := l.(logger.LogReader); !ok {
		if cache.ShouldUseCache(cfg.Config) {
			logPath, err := container.GetRootResourcePath("container-cached.log")
			if err != nil {
				return nil, err
			}
			if !container.LocalLogCacheMeta.HaveNotifyEnabled {
				log.G(context.TODO()).WithField("container", container.ID).WithField("driver", container.HostConfig.LogConfig.Type).Info("Configured log driver does not support reads, enabling local file cache for container logs")
				container.LocalLogCacheMeta.HaveNotifyEnabled = true
			}
			info.LogPath = logPath
			l, err = cache.WithLocalCache(l, info)
			if err != nil {
				return nil, errors.Wrap(err, "error setting up local container log cache")
			}
		}
	}
	return l, nil
}
// GetProcessLabel returns the process label for the container.
// Privileged containers always get an empty label.
func (container *Container) GetProcessLabel() string {
	if container.HostConfig.Privileged {
		// even if we have a process label return "" if we are running
		// in privileged mode
		return ""
	}
	return container.ProcessLabel
}
// GetMountLabel returns the mounting label for the container.
// This label is empty if the container is privileged.
func (container *Container) GetMountLabel() string {
	return container.MountLabel
}
// GetExecIDs returns the IDs of the exec commands registered on the container.
func (container *Container) GetExecIDs() []string {
	return container.ExecCommands.List()
}
// ShouldRestart decides whether the daemon should restart the container or not.
// This is based on the container's restart policy.
func (container *Container) ShouldRestart() bool {
	restart, _, _ := container.RestartManager().ShouldRestart(
		uint32(container.ExitCode()),
		container.HasBeenManuallyStopped,
		container.FinishedAt.Sub(container.StartedAt),
	)
	return restart
}
// AddMountPointWithVolume adds a new mount point configured with a volume to the container.
func (container *Container) AddMountPointWithVolume(destination string, vol volume.Volume, rw bool) {
	parser := volumemounts.NewParser()
	mp := &volumemounts.MountPoint{
		Type:        mounttypes.TypeVolume,
		Name:        vol.Name(),
		Driver:      vol.DriverName(),
		Destination: destination,
		RW:          rw,
		Volume:      vol,
		CopyData:    parser.DefaultCopyMode(),
	}
	container.MountPoints[destination] = mp
}
// UnmountVolumes unmounts all volumes, logging an unmount event for each
// successfully cleaned-up mount and collecting failures into a single error.
func (container *Container) UnmountVolumes(ctx context.Context, volumeEventLog func(name string, action events.Action, attributes map[string]string)) error {
	var errs []string
	for _, mp := range container.MountPoints {
		if mp.Volume == nil {
			continue
		}
		if err := mp.Cleanup(ctx); err != nil {
			errs = append(errs, err.Error())
			continue
		}
		attributes := map[string]string{
			"driver":    mp.Volume.DriverName(),
			"container": container.ID,
		}
		volumeEventLog(mp.Volume.Name(), events.ActionUnmount, attributes)
	}
	if len(errs) == 0 {
		return nil
	}
	return fmt.Errorf("error while unmounting volumes for container %s: %s", container.ID, strings.Join(errs, "; "))
}
// IsDestinationMounted checks whether a path is mounted on the container or not.
func (container *Container) IsDestinationMounted(destination string) bool {
	_, mounted := container.MountPoints[destination]
	return mounted
}
// StopSignal returns the signal used to stop the container, falling back to
// the default (SIGTERM) when no valid StopSignal is configured.
func (container *Container) StopSignal() syscall.Signal {
	if container.Config.StopSignal == "" {
		return defaultStopSignal
	}
	// signal.ParseSignal returns "-1" for invalid or unknown signals.
	if sig, err := signal.ParseSignal(container.Config.StopSignal); err == nil && sig > 0 {
		return sig
	}
	return defaultStopSignal
}
// StopTimeout returns the timeout (in seconds) used to stop the container,
// falling back to the daemon default when none is configured.
func (container *Container) StopTimeout() int {
	if t := container.Config.StopTimeout; t != nil {
		return *t
	}
	return defaultStopTimeout
}
// InitDNSHostConfig ensures that the dns fields are never nil.
// New containers don't ever have those fields nil,
// but pre created containers can still have those nil values.
// The non-recommended host configuration in the start api can
// make these fields nil again, this corrects that issue until
// we remove that behavior for good.
// See https://github.com/docker/docker/pull/17779
// for a more detailed explanation on why we don't want that.
func (container *Container) InitDNSHostConfig() {
	container.Lock()
	defer container.Unlock()
	if container.HostConfig.DNS == nil {
		container.HostConfig.DNS = []string{}
	}
	if container.HostConfig.DNSSearch == nil {
		container.HostConfig.DNSSearch = []string{}
	}
	if container.HostConfig.DNSOptions == nil {
		container.HostConfig.DNSOptions = []string{}
	}
}
// UpdateMonitor updates the restart policy used by the running container's
// restart manager.
func (container *Container) UpdateMonitor(restartPolicy containertypes.RestartPolicy) {
	container.RestartManager().SetPolicy(restartPolicy)
}
// FullHostname returns hostname and optional domain appended to it.
func (container *Container) FullHostname() string {
	hostname := container.Config.Hostname
	if domain := container.Config.Domainname; domain != "" {
		return hostname + "." + domain
	}
	return hostname
}
// RestartManager returns the current restartmanager instance connected to
// container, creating it lazily from the container's restart policy and
// restart count on first use.
func (container *Container) RestartManager() *restartmanager.RestartManager {
	if container.restartManager == nil {
		container.restartManager = restartmanager.New(container.HostConfig.RestartPolicy, container.RestartCount)
	}
	return container.restartManager
}
// ResetRestartManager initializes new restartmanager based on container config
func (container *Container) ResetRestartManager(resetCount bool) {
if container.restartManager != nil {
container.restartManager.Cancel()
}
if resetCount {
container.RestartCount = 0
}
container.restartManager = nil
}
// AttachContext returns the context for attach calls to track container liveness.
func (container *Container) AttachContext() context.Context {
	return container.attachContext.init()
}

// CancelAttachContext cancels attach context. All attach calls should detach
// after this call.
func (container *Container) CancelAttachContext() {
	container.attachContext.cancel()
}
// startLogging initializes the configured logging driver and starts copying
// the container's stdout/stderr into it. A "none" log driver disables
// logging entirely.
func (container *Container) startLogging() error {
	if container.HostConfig.LogConfig.Type == "none" {
		return nil // do not start logging routines
	}
	l, err := container.StartLogger()
	if err != nil {
		return fmt.Errorf("failed to initialize logging driver: %v", err)
	}
	// Copy both stdio streams into the driver in the background.
	copier := logger.NewCopier(map[string]io.Reader{"stdout": container.StdoutPipe(), "stderr": container.StderrPipe()}, l)
	container.LogCopier = copier
	copier.Run()
	container.LogDriver = l
	return nil
}
// StdinPipe gets the stdin stream of the container
func (container *Container) StdinPipe() io.WriteCloser {
	return container.StreamConfig.StdinPipe()
}

// StdoutPipe gets the stdout stream of the container
func (container *Container) StdoutPipe() io.ReadCloser {
	return container.StreamConfig.StdoutPipe()
}

// StderrPipe gets the stderr stream of the container
func (container *Container) StderrPipe() io.ReadCloser {
	return container.StreamConfig.StderrPipe()
}

// CloseStreams closes the container's stdio streams
func (container *Container) CloseStreams() error {
	return container.StreamConfig.CloseStreams()
}
// InitializeStdio is called by libcontainerd to connect the stdio.
//
// Logging is started first; on failure the container is reset (streams
// closed, stdin re-created) before returning the error. When the container
// has no stdin and no TTY, the provided stdin end is closed immediately.
func (container *Container) InitializeStdio(iop *cio.DirectIO) (cio.IO, error) {
	if err := container.startLogging(); err != nil {
		container.Reset(false)
		return nil, err
	}
	container.StreamConfig.CopyToPipe(iop)
	if container.StreamConfig.Stdin() == nil && !container.Config.Tty {
		if iop.Stdin != nil {
			if err := iop.Stdin.Close(); err != nil {
				log.G(context.TODO()).Warnf("error closing stdin: %+v", err)
			}
		}
	}
	// Wrap the IO so that Close/Wait also close/wait the stream config.
	return &rio{IO: iop, sc: container.StreamConfig}, nil
}
// MountsResourcePath returns the path where mounts are stored for the given mount
func (container *Container) MountsResourcePath(mount string) (string, error) {
	return container.GetRootResourcePath(filepath.Join("mounts", mount))
}

// SecretMountPath returns the path of the secret mount for the container
func (container *Container) SecretMountPath() (string, error) {
	return container.MountsResourcePath("secrets")
}

// SecretFilePath returns the path to the location of a secret on the host.
// Secret files are keyed by their SecretID under the secret mount directory.
func (container *Container) SecretFilePath(secretRef swarmtypes.SecretReference) (string, error) {
	secrets, err := container.SecretMountPath()
	if err != nil {
		return "", err
	}
	return filepath.Join(secrets, secretRef.SecretID), nil
}
// getSecretTargetPath returns the in-container path for a secret, resolving
// relative names under containerSecretMountPath.
func getSecretTargetPath(r *swarmtypes.SecretReference) string {
	name := r.File.Name
	if filepath.IsAbs(name) {
		return name
	}
	return filepath.Join(containerSecretMountPath, name)
}

// getConfigTargetPath makes sure that config paths inside the container are
// absolute, as required by the runtime spec, and enforced by runc >= 1.0.0-rc94.
// see https://github.com/opencontainers/runc/issues/2928
func getConfigTargetPath(r *swarmtypes.ConfigReference) string {
	name := r.File.Name
	if filepath.IsAbs(name) {
		return name
	}
	return filepath.Join(containerConfigMountPath, name)
}
// CreateDaemonEnvironment creates a new environment variable slice for this container.
//
// On non-Windows platforms, PATH and HOSTNAME (and TERM when tty is set) are
// prepended along with the linked-container environment; the container's own
// Config.Env entries then override matching keys via ReplaceOrAppendEnvValues.
func (container *Container) CreateDaemonEnvironment(tty bool, linkedEnv []string) []string {
	// Setup environment
	ctrOS := container.ImagePlatform.OS
	if ctrOS == "" {
		ctrOS = runtime.GOOS
	}
	// Figure out what size slice we need so we can allocate this all at once.
	envSize := len(container.Config.Env)
	if runtime.GOOS != "windows" {
		envSize += 2 + len(linkedEnv) // PATH and HOSTNAME plus linked env
	}
	// Note: counted even on Windows where TERM is not appended below; that
	// only over-reserves capacity by one.
	if tty {
		envSize++
	}
	env := make([]string, 0, envSize)
	if runtime.GOOS != "windows" {
		env = append(env, "PATH="+oci.DefaultPathEnv(ctrOS))
		env = append(env, "HOSTNAME="+container.Config.Hostname)
		if tty {
			env = append(env, "TERM=xterm")
		}
		env = append(env, linkedEnv...)
	}
	// because the env on the container can override certain default values
	// we need to replace the 'env' keys where they match and append anything
	// else.
	env = ReplaceOrAppendEnvValues(env, container.Config.Env)
	return env
}
// RestoreTask restores the containerd container and task handles and reattaches
// the IO for the running task. Container state is not synced with containerd's
// state.
//
// An errdefs.NotFound error is returned if the container does not exist in
// containerd. However, a nil error is returned if the task does not exist in
// containerd.
func (container *Container) RestoreTask(ctx context.Context, client libcontainerdtypes.Client) error {
	container.Lock()
	defer container.Unlock()
	var err error
	container.ctr, err = client.LoadContainer(ctx, container.ID)
	if err != nil {
		return err
	}
	// A missing task is tolerated: container.task stays nil in that case.
	container.task, err = container.ctr.AttachTask(ctx, container.InitializeStdio)
	if err != nil && !cerrdefs.IsNotFound(err) {
		return err
	}
	return nil
}
// GetRunningTask asserts that the container is running and returns the Task for
// the container. An errdefs.Conflict error is returned if the container is not
// in the Running state.
//
// A system error is returned if container is in a bad state: Running is true
// but has a nil Task.
//
// The container lock must be held when calling this method.
func (container *Container) GetRunningTask() (libcontainerdtypes.Task, error) {
	if !container.Running {
		return nil, errdefs.Conflict(fmt.Errorf("container %s is not running", container.ID))
	}
	tsk, ok := container.Task()
	if !ok {
		// Running-with-no-task is an internal invariant violation; attach a
		// stack trace to aid debugging.
		return nil, errdefs.System(errors.WithStack(fmt.Errorf("container %s is in Running state but has no containerd Task set", container.ID)))
	}
	return tsk, nil
}
// rio wraps a containerd cio.IO so that closing or waiting on the IO also
// closes or waits on the container's stream config.
type rio struct {
	cio.IO
	sc *stream.Config
}

func (i *rio) Close() error {
	// NOTE(review): the error from the embedded IO's Close is discarded and
	// only the stream-config result is returned — confirm this is intended.
	i.IO.Close()
	return i.sc.CloseStreams()
}

func (i *rio) Wait() {
	// Wait for stream copying to finish before waiting on the IO itself.
	i.sc.Wait(context.Background())
	i.IO.Wait()
}
// conflictingUpdateOptions is an error type for update requests that conflict
// with the container's existing configuration. The Conflict marker method
// presumably causes it to be classified as a conflict (409-style) error by
// the daemon's error helpers — verify against the errdefs consumer.
type conflictingUpdateOptions string

// Error returns the underlying message.
func (e conflictingUpdateOptions) Error() string {
	return string(e)
}

// Conflict marks this error as a conflict error.
func (e conflictingUpdateOptions) Conflict() {}
//go:build !windows
package container
import (
"context"
"os"
"path/filepath"
"syscall"
"github.com/containerd/continuity/fs"
"github.com/containerd/log"
volumemounts "github.com/docker/docker/daemon/volume/mounts"
containertypes "github.com/moby/moby/api/types/container"
"github.com/moby/moby/api/types/events"
mounttypes "github.com/moby/moby/api/types/mount"
swarmtypes "github.com/moby/moby/api/types/swarm"
"github.com/moby/sys/mount"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/pkg/errors"
)
const (
	// defaultStopTimeout sets the default time, in seconds, to wait
	// for the graceful container stop before forcefully terminating it.
	defaultStopTimeout = 10

	// containerConfigMountPath is the base directory for relative config
	// target names inside the container (the filesystem root on unix).
	containerConfigMountPath = "/"
	// containerSecretMountPath is the base directory for relative secret
	// target names inside the container.
	containerSecretMountPath = "/run/secrets"
)
// TrySetNetworkMount attempts to set the network mounts given a provided destination and
// the path to use for it; return true if the given destination was a network mount file
func (container *Container) TrySetNetworkMount(destination string, path string) bool {
	switch destination {
	case "/etc/resolv.conf":
		container.ResolvConfPath = path
	case "/etc/hostname":
		container.HostnamePath = path
	case "/etc/hosts":
		container.HostsPath = path
	default:
		return false
	}
	return true
}
// BuildHostnameFile writes the container's hostname file.
//
// The file lives under the container's root resource path and its location is
// recorded in container.HostnamePath.
func (container *Container) BuildHostnameFile() error {
	hostnamePath, err := container.GetRootResourcePath("hostname")
	if err != nil {
		return err
	}
	container.HostnamePath = hostnamePath
	return os.WriteFile(container.HostnamePath, []byte(container.Config.Hostname+"\n"), 0o644)
}
// NetworkMounts returns the list of network mounts.
//
// The three network files (resolv.conf, hostname, hosts) are handled
// identically: a path that is unset or cannot be stat'ed is skipped with a
// warning; writability follows an explicit mount point when one exists,
// otherwise the read-only-rootfs setting, and the file is relabeled
// best-effort for SELinux.
func (container *Container) NetworkMounts() []Mount {
	ctx := context.TODO()
	var mounts []Mount
	shared := container.HostConfig.NetworkMode.IsContainer()
	parser := volumemounts.NewParser()
	for _, nm := range []struct {
		field string // name of the Container field, used in the warning
		src   string
		dest  string
	}{
		{"ResolvConfPath", container.ResolvConfPath, "/etc/resolv.conf"},
		{"HostnamePath", container.HostnamePath, "/etc/hostname"},
		{"HostsPath", container.HostsPath, "/etc/hosts"},
	} {
		if nm.src == "" {
			continue
		}
		if _, err := os.Stat(nm.src); err != nil {
			log.G(ctx).Warnf("%s set to %q, but can't stat this filename (err = %v); skipping", nm.field, nm.src, err)
			continue
		}
		writable := !container.HostConfig.ReadonlyRootfs
		if m, exists := container.MountPoints[nm.dest]; exists {
			writable = m.RW
		} else {
			// Best-effort relabel; the error was ignored in the original
			// per-file stanzas as well.
			label.Relabel(nm.src, container.MountLabel, shared)
		}
		mounts = append(mounts, Mount{
			Source:      nm.src,
			Destination: nm.dest,
			Writable:    writable,
			Propagation: string(parser.DefaultPropagationMode()),
		})
	}
	return mounts
}
// CopyImagePathContent copies files in destination to the volume.
//
// The volume path is relabeled first; ENOTSUP (filesystem without label
// support) is tolerated.
func (container *Container) CopyImagePathContent(volumePath, destination string) error {
	if err := label.Relabel(volumePath, container.MountLabel, true); err != nil && !errors.Is(err, syscall.ENOTSUP) {
		return err
	}
	return copyExistingContents(destination, volumePath)
}

// ShmResourcePath returns path to shm
func (container *Container) ShmResourcePath() (string, error) {
	return container.MountsResourcePath("shm")
}
// HasMountFor checks if path is a mountpoint
//
// Both explicit mount points and tmpfs mounts (keyed by destination) are
// considered.
func (container *Container) HasMountFor(path string) bool {
	if _, exists := container.MountPoints[path]; exists {
		return true
	}
	// Tmpfs is keyed by destination path, so a direct lookup replaces the
	// previous linear scan.
	_, exists := container.HostConfig.Tmpfs[path]
	return exists
}
// UnmountIpcMount unmounts shm if it was mounted
//
// Nothing is done when /dev/shm is covered by an explicit mount or tmpfs
// (HasMountFor), or when the container has no shm resource path.
func (container *Container) UnmountIpcMount() error {
	if container.HasMountFor("/dev/shm") {
		return nil
	}
	// container.ShmPath should not be used here as it may point
	// to the host's or other container's /dev/shm
	shmPath, err := container.ShmResourcePath()
	if err != nil {
		return err
	}
	if shmPath == "" {
		return nil
	}
	// An already-gone mount point is not an error.
	if err = mount.Unmount(shmPath); err != nil && !errors.Is(err, os.ErrNotExist) {
		return err
	}
	return nil
}
// IpcMounts returns the list of IPC mounts
//
// Returns nothing when /dev/shm is already covered by a user mount, or when
// the container has no ShmPath.
func (container *Container) IpcMounts() []Mount {
	var mounts []Mount
	parser := volumemounts.NewParser()
	if container.HasMountFor("/dev/shm") {
		return mounts
	}
	if container.ShmPath == "" {
		return mounts
	}
	// Best-effort SELinux relabel; the error is intentionally not checked.
	label.SetFileLabel(container.ShmPath, container.MountLabel)
	mounts = append(mounts, Mount{
		Source:      container.ShmPath,
		Destination: "/dev/shm",
		Writable:    true,
		Propagation: string(parser.DefaultPropagationMode()),
	})
	return mounts
}
// SecretMounts returns the mounts for the secret path.
//
// Both secret and config references are included; references whose File is
// nil are skipped in both loops (the config loop previously lacked this
// guard, which would panic in getConfigTargetPath on r.File.Name).
func (container *Container) SecretMounts() ([]Mount, error) {
	var mounts []Mount
	for _, r := range container.SecretReferences {
		if r.File == nil {
			continue
		}
		src, err := container.SecretFilePath(*r)
		if err != nil {
			return nil, err
		}
		mounts = append(mounts, Mount{
			Source:      src,
			Destination: getSecretTargetPath(r),
			Writable:    false,
		})
	}
	for _, r := range container.ConfigReferences {
		// Guard against references without file data, mirroring the
		// secret loop above.
		if r.File == nil {
			continue
		}
		fPath, err := container.ConfigFilePath(*r)
		if err != nil {
			return nil, err
		}
		mounts = append(mounts, Mount{
			Source:      fPath,
			Destination: getConfigTargetPath(r),
			Writable:    false,
		})
	}
	return mounts, nil
}
// UnmountSecrets unmounts the local tmpfs for secrets
//
// A missing secret directory is not an error.
func (container *Container) UnmountSecrets() error {
	p, err := container.SecretMountPath()
	if err != nil {
		return err
	}
	if _, err := os.Stat(p); err != nil {
		// errors.Is matches wrapped errors too, consistent with the
		// equivalent check in UnmountIpcMount.
		if errors.Is(err, os.ErrNotExist) {
			return nil
		}
		return err
	}
	return mount.RecursiveUnmount(p)
}
// UpdateContainer updates configuration of a container. Callers must hold a Lock on the Container.
//
// All conflict validation is performed before any field is written, so a
// rejected update leaves the container's resources untouched. Zero values in
// the request mean "leave unchanged" for every resource field.
func (container *Container) UpdateContainer(hostConfig *containertypes.HostConfig) error {
	// update resources of container
	resources := hostConfig.Resources
	cResources := &container.HostConfig.Resources

	// validate NanoCPUs, CPUPeriod, and CPUQuota
	// Because NanoCPU effectively updates CPUPeriod/CPUQuota,
	// once NanoCPU is already set, updating CPUPeriod/CPUQuota will be blocked, and vice versa.
	// In the following we make sure the intended update (resources) does not conflict with the existing (cResource).
	if resources.NanoCPUs > 0 && cResources.CPUPeriod > 0 {
		return conflictingUpdateOptions("Conflicting options: Nano CPUs cannot be updated as CPU Period has already been set")
	}
	if resources.NanoCPUs > 0 && cResources.CPUQuota > 0 {
		return conflictingUpdateOptions("Conflicting options: Nano CPUs cannot be updated as CPU Quota has already been set")
	}
	if resources.CPUPeriod > 0 && cResources.NanoCPUs > 0 {
		return conflictingUpdateOptions("Conflicting options: CPU Period cannot be updated as NanoCPUs has already been set")
	}
	if resources.CPUQuota > 0 && cResources.NanoCPUs > 0 {
		return conflictingUpdateOptions("Conflicting options: CPU Quota cannot be updated as NanoCPUs has already been set")
	}
	if resources.BlkioWeight != 0 {
		cResources.BlkioWeight = resources.BlkioWeight
	}
	if resources.CPUShares != 0 {
		cResources.CPUShares = resources.CPUShares
	}
	if resources.NanoCPUs != 0 {
		cResources.NanoCPUs = resources.NanoCPUs
	}
	if resources.CPUPeriod != 0 {
		cResources.CPUPeriod = resources.CPUPeriod
	}
	if resources.CPUQuota != 0 {
		cResources.CPUQuota = resources.CPUQuota
	}
	if resources.CpusetCpus != "" {
		cResources.CpusetCpus = resources.CpusetCpus
	}
	if resources.CpusetMems != "" {
		cResources.CpusetMems = resources.CpusetMems
	}
	if resources.Memory != 0 {
		// if memory limit smaller than already set memoryswap limit and doesn't
		// update the memoryswap limit, then error out.
		if resources.Memory > cResources.MemorySwap && resources.MemorySwap == 0 {
			return conflictingUpdateOptions("Memory limit should be smaller than already set memoryswap limit, update the memoryswap at the same time")
		}
		cResources.Memory = resources.Memory
	}
	if resources.MemorySwap != 0 {
		cResources.MemorySwap = resources.MemorySwap
	}
	if resources.MemoryReservation != 0 {
		cResources.MemoryReservation = resources.MemoryReservation
	}
	if resources.KernelMemory != 0 {
		cResources.KernelMemory = resources.KernelMemory
	}
	if resources.CPURealtimePeriod != 0 {
		cResources.CPURealtimePeriod = resources.CPURealtimePeriod
	}
	if resources.CPURealtimeRuntime != 0 {
		cResources.CPURealtimeRuntime = resources.CPURealtimeRuntime
	}
	if resources.PidsLimit != nil {
		cResources.PidsLimit = resources.PidsLimit
	}

	// update HostConfig of container
	if hostConfig.RestartPolicy.Name != "" {
		if container.HostConfig.AutoRemove && !hostConfig.RestartPolicy.IsNone() {
			return conflictingUpdateOptions("Restart policy cannot be updated because AutoRemove is enabled for the container")
		}
		container.HostConfig.RestartPolicy = hostConfig.RestartPolicy
	}
	return nil
}
// DetachAndUnmount uses a detached mount on all mount destinations, then
// unmounts each volume normally.
// This is used from daemon/archive for `docker cp`
//
// Failures to resolve or unmount individual paths are logged and skipped;
// the volumes themselves are unmounted at the end regardless.
func (container *Container) DetachAndUnmount(volumeEventLog func(name string, action events.Action, attributes map[string]string)) error {
	ctx := context.TODO()

	networkMounts := container.NetworkMounts()
	mountPaths := make([]string, 0, len(container.MountPoints)+len(networkMounts))

	for _, mntPoint := range container.MountPoints {
		dest, err := container.GetResourcePath(mntPoint.Destination)
		if err != nil {
			log.G(ctx).Warnf("Failed to get volume destination path for container '%s' at '%s' while lazily unmounting: %v", container.ID, mntPoint.Destination, err)
			continue
		}
		mountPaths = append(mountPaths, dest)
	}

	for _, m := range networkMounts {
		dest, err := container.GetResourcePath(m.Destination)
		if err != nil {
			log.G(ctx).Warnf("Failed to get volume destination path for container '%s' at '%s' while lazily unmounting: %v", container.ID, m.Destination, err)
			continue
		}
		mountPaths = append(mountPaths, dest)
	}

	for _, mountPath := range mountPaths {
		if err := mount.Unmount(mountPath); err != nil {
			log.G(ctx).WithError(err).WithField("container", container.ID).
				Warn("Unable to unmount")
		}
	}
	return container.UnmountVolumes(ctx, volumeEventLog)
}
// ignoreUnsupportedXAttrs ignores errors when extended attributes
// are not supported
func ignoreUnsupportedXAttrs() fs.CopyDirOpt {
	return fs.WithXAttrErrorHandler(func(dst, src, xattrKey string, err error) error {
		// Only ENOTSUP (filesystem without xattr support) is swallowed.
		if !errors.Is(err, syscall.ENOTSUP) {
			return err
		}
		return nil
	})
}

// copyExistingContents copies from the source to the destination and
// ensures the ownership is appropriately set.
//
// The copy is skipped entirely when the destination already has contents.
func copyExistingContents(source, destination string) error {
	dstList, err := os.ReadDir(destination)
	if err != nil {
		return err
	}
	if len(dstList) != 0 {
		log.G(context.TODO()).WithFields(log.Fields{
			"source":      source,
			"destination": destination,
		}).Debug("destination is not empty, do not copy")
		return nil
	}
	return fs.CopyDir(destination, source, ignoreUnsupportedXAttrs())
}
// TmpfsMounts returns the list of tmpfs mounts
//
// Both HostConfig.Tmpfs entries and mount points of type tmpfs are included.
func (container *Container) TmpfsMounts() ([]Mount, error) {
	var mounts []Mount
	for dest, data := range container.HostConfig.Tmpfs {
		mounts = append(mounts, Mount{
			Source:      "tmpfs",
			Destination: dest,
			Data:        data,
		})
	}
	parser := volumemounts.NewParser()
	for dest, mnt := range container.MountPoints {
		if mnt.Type == mounttypes.TypeTmpfs {
			// Convert the mount spec's tmpfs options to mount data options.
			data, err := parser.ConvertTmpfsOptions(mnt.Spec.TmpfsOptions, mnt.Spec.ReadOnly)
			if err != nil {
				return nil, err
			}
			mounts = append(mounts, Mount{
				Source:      "tmpfs",
				Destination: dest,
				Data:        data,
			})
		}
	}
	return mounts, nil
}
// GetMountPoints gives a platform specific transformation to types.MountPoint. Callers must hold a Container lock.
func (container *Container) GetMountPoints() []containertypes.MountPoint {
	mountPoints := make([]containertypes.MountPoint, 0, len(container.MountPoints))
	for _, m := range container.MountPoints {
		mountPoints = append(mountPoints, containertypes.MountPoint{
			Type:        m.Type,
			Name:        m.Name,
			Source:      m.Path(),
			Destination: m.Destination,
			Driver:      m.Driver,
			Mode:        m.Mode,
			RW:          m.RW,
			Propagation: m.Propagation,
		})
	}
	return mountPoints
}
// ConfigFilePath returns the path to the on-disk location of a config.
// On unix, configs are always considered secret
//
// Accordingly, config files are stored under the secret mount directory,
// keyed by ConfigID.
func (container *Container) ConfigFilePath(configRef swarmtypes.ConfigReference) (string, error) {
	mounts, err := container.SecretMountPath()
	if err != nil {
		return "", err
	}
	return filepath.Join(mounts, configRef.ConfigID), nil
}
package container
import (
"strings"
)
// ReplaceOrAppendEnvValues returns the defaults with the overrides either
// replaced by env key or appended to the list
//
// An override of the form "KEY=value" replaces the default with the same key
// or is appended when no default matches. An override with no "=" ("KEY")
// requests removal of the matching default, if any.
//
// The defaults slice is modified in place and must not be reused afterwards.
func ReplaceOrAppendEnvValues(defaults, overrides []string) []string {
	cache := make(map[string]int, len(defaults))
	for i, e := range defaults {
		// strings.Cut yields the whole string as key when no "=" is present,
		// instead of panicking on a malformed entry (e[:-1] previously).
		key, _, _ := strings.Cut(e, "=")
		cache[key] = i
	}

	for _, value := range overrides {
		key, _, hasValue := strings.Cut(value, "=")
		if !hasValue {
			// Values w/o = means they want this env to be removed/unset.
			if i, exists := cache[key]; exists {
				defaults[i] = "" // Used to indicate it should be removed
			}
			continue
		}
		if i, exists := cache[key]; exists {
			defaults[i] = value
		} else {
			defaults = append(defaults, value)
		}
	}

	// Now remove all entries that we want to "unset", compacting in place
	// over the same backing array (linear, instead of repeated deletes).
	out := defaults[:0]
	for _, e := range defaults {
		if e != "" {
			out = append(out, e)
		}
	}
	return out
}
package container
import (
"context"
"runtime"
"sync"
"github.com/containerd/containerd/v2/pkg/cio"
"github.com/containerd/log"
"github.com/docker/docker/daemon/internal/libcontainerd/types"
"github.com/docker/docker/daemon/internal/stream"
"github.com/docker/docker/pkg/stringid"
)
// ExecConfig holds the configurations for execs. The Daemon keeps
// track of both running and finished execs so that they can be
// examined both during and after completion.
type ExecConfig struct {
	sync.Mutex
	Started      chan struct{} // closed/signalled once the exec process has started
	StreamConfig *stream.Config
	ID           string
	Running      bool
	ExitCode     *int // nil until the process has exited
	OpenStdin    bool
	OpenStderr   bool
	OpenStdout   bool
	CanRemove    bool
	Container    *Container // the container this exec runs in
	DetachKeys   []byte
	Entrypoint   string
	Args         []string
	Tty          bool
	Privileged   bool
	User         string
	WorkingDir   string
	Env          []string
	Process      types.Process
	ConsoleSize  *[2]uint
}
// NewExecConfig initializes the a new exec configuration
//
// A random ID, fresh stream config, and an unbuffered Started channel are
// allocated for the given container.
func NewExecConfig(c *Container) *ExecConfig {
	return &ExecConfig{
		ID:           stringid.GenerateRandomID(),
		Container:    c,
		StreamConfig: stream.NewConfig(),
		Started:      make(chan struct{}),
	}
}
// InitializeStdio is called by libcontainerd to connect the stdio.
//
// NOTE(review): unlike Container.InitializeStdio, the unused stdin end is
// closed only on Windows here (runtime.GOOS check) — confirm the asymmetry
// is intentional.
func (c *ExecConfig) InitializeStdio(iop *cio.DirectIO) (cio.IO, error) {
	c.StreamConfig.CopyToPipe(iop)

	if c.StreamConfig.Stdin() == nil && !c.Tty && runtime.GOOS == "windows" {
		if iop.Stdin != nil {
			if err := iop.Stdin.Close(); err != nil {
				log.G(context.TODO()).Errorf("error closing exec stdin: %+v", err)
			}
		}
	}

	return &rio{IO: iop, sc: c.StreamConfig}, nil
}
// CloseStreams closes the stdio streams for the exec
func (c *ExecConfig) CloseStreams() error {
	return c.StreamConfig.CloseStreams()
}

// SetExitCode sets the exec config's exit code
func (c *ExecConfig) SetExitCode(code int) {
	c.ExitCode = &code
}
// ExecStore keeps track of the exec configurations.
//
// All accesses to the byID map are guarded by mu.
type ExecStore struct {
	byID map[string]*ExecConfig
	mu   sync.RWMutex
}

// NewExecStore initializes a new exec store.
func NewExecStore() *ExecStore {
	return &ExecStore{
		byID: make(map[string]*ExecConfig),
	}
}
// Commands returns a snapshot copy of the exec configurations in the store.
func (e *ExecStore) Commands() map[string]*ExecConfig {
	e.mu.RLock()
	defer e.mu.RUnlock()
	commands := make(map[string]*ExecConfig, len(e.byID))
	for id, config := range e.byID {
		commands[id] = config
	}
	return commands
}
// Add adds a new exec configuration to the store.
func (e *ExecStore) Add(id string, config *ExecConfig) {
	// Parameter renamed from "Config" to idiomatic lowercase; callers are
	// unaffected (Go parameters are positional).
	e.mu.Lock()
	e.byID[id] = config
	e.mu.Unlock()
}
// Get returns an exec configuration by its id, or nil when not present.
func (e *ExecStore) Get(id string) *ExecConfig {
	e.mu.RLock()
	defer e.mu.RUnlock()
	return e.byID[id]
}

// Delete removes an exec configuration from the store.
func (e *ExecStore) Delete(id string) {
	e.mu.Lock()
	defer e.mu.Unlock()
	delete(e.byID, id)
}
// List returns the list of exec ids in the store.
func (e *ExecStore) List() []string {
	// A nil slice is returned for an empty store, as before.
	var ids []string
	e.mu.RLock()
	defer e.mu.RUnlock()
	for id := range e.byID {
		ids = append(ids, id)
	}
	return ids
}
package container
import (
"context"
"sync"
"github.com/containerd/log"
"github.com/moby/moby/api/types/container"
)
// Health holds the current container health-check state
type Health struct {
	container.Health
	stop chan struct{} // Write struct{} to stop the monitor
	mu   sync.Mutex    // guards Health.Status and stop
}
// String returns a human-readable description of the health-check state
func (s *Health) String() string {
	status := s.Status()
	if status != container.Starting {
		// Healthy and Unhealthy are clear on their own
		return status
	}
	return "health: starting"
}
// Status returns the current health status.
//
// Note that this takes a lock and the value may change after being read.
func (s *Health) Status() container.HealthStatus {
	s.mu.Lock()
	defer s.mu.Unlock()

	// This happens when the monitor has yet to be setup.
	if s.Health.Status == "" {
		return container.Unhealthy
	}

	return s.Health.Status
}

// SetStatus writes the current status to the underlying health structure,
// obeying the locking semantics.
//
// Status may be set directly if another lock is used.
func (s *Health) SetStatus(healthStatus container.HealthStatus) {
	s.mu.Lock()
	defer s.mu.Unlock()

	s.Health.Status = healthStatus
}
// OpenMonitorChannel creates and returns a new monitor channel. If there
// already is one, it returns nil.
func (s *Health) OpenMonitorChannel() chan struct{} {
	s.mu.Lock()
	defer s.mu.Unlock()

	if s.stop == nil {
		log.G(context.TODO()).Debug("OpenMonitorChannel")
		s.stop = make(chan struct{})
		return s.stop
	}
	return nil
}

// CloseMonitorChannel closes any existing monitor channel.
//
// Closing the channel signals the probe goroutine to stop; the status is
// forced to Unhealthy afterwards.
func (s *Health) CloseMonitorChannel() {
	s.mu.Lock()
	defer s.mu.Unlock()

	if s.stop != nil {
		log.G(context.TODO()).Debug("CloseMonitorChannel: waiting for probe to stop")
		close(s.stop)
		s.stop = nil
		// unhealthy when the monitor has stopped for compatibility reasons
		s.Health.Status = container.Unhealthy
		log.G(context.TODO()).Debug("CloseMonitorChannel done")
	}
}
package container
import "sort"
// History is a convenience type for storing a list of containers,
// sorted by creation date in descendant order.
type History []*Container

// Len returns the number of containers in the history.
func (history *History) Len() int {
	return len(*history)
}

// Less compares two containers and returns true if the second one
// was created before the first one.
func (history *History) Less(i, j int) bool {
	h := *history
	return h[i].Created.After(h[j].Created)
}

// Swap switches containers i and j positions in the history.
func (history *History) Swap(i, j int) {
	h := *history
	h[i], h[j] = h[j], h[i]
}

// sort orders the history by creation date in descendant order.
func (history *History) sort() {
	sort.Sort(history)
}
package container
import (
"sync"
)
// memoryStore implements a Store in memory.
//
// The embedded RWMutex guards all access to the s map.
type memoryStore struct {
	s map[string]*Container
	sync.RWMutex
}

// NewMemoryStore initializes a new memory store.
func NewMemoryStore() Store {
	return &memoryStore{
		s: make(map[string]*Container),
	}
}
// Add appends a new container to the memory store.
// It overrides the id if it existed before.
func (c *memoryStore) Add(id string, cont *Container) {
	c.Lock()
	defer c.Unlock()
	c.s[id] = cont
}

// Get returns a container from the store by id.
func (c *memoryStore) Get(id string) *Container {
	c.RLock()
	defer c.RUnlock()
	return c.s[id]
}

// Delete removes a container from the store by id.
func (c *memoryStore) Delete(id string) {
	c.Lock()
	defer c.Unlock()
	delete(c.s, id)
}
// List returns a sorted list of containers from the store.
// The containers are ordered by creation date.
func (c *memoryStore) List() []*Container {
	containers := History(c.all())
	containers.sort()
	return containers
}

// Size returns the number of containers in the store.
func (c *memoryStore) Size() int {
	c.RLock()
	defer c.RUnlock()
	return len(c.s)
}
// First returns the first container found in the store by a given filter.
//
// The filter runs over a snapshot (c.all()), so the store lock is not held
// while the filter executes.
func (c *memoryStore) First(filter StoreFilter) *Container {
	for _, cont := range c.all() {
		if filter(cont) {
			return cont
		}
	}
	return nil
}
// ApplyAll calls the reducer function with every container in the store.
// This operation is asynchronous in the memory store.
// NOTE: Modifications to the store MUST NOT be done by the StoreReducer.
func (c *memoryStore) ApplyAll(apply StoreReducer) {
	var wg sync.WaitGroup
	for _, cont := range c.all() {
		wg.Add(1)
		// The container is passed as an argument so each goroutine works on
		// its own iteration value.
		go func(cont *Container) {
			defer wg.Done()
			apply(cont)
		}(cont)
	}
	wg.Wait()
}
// all returns a snapshot slice of every container currently in the store.
func (c *memoryStore) all() []*Container {
	c.RLock()
	containers := make([]*Container, 0, len(c.s))
	for _, cont := range c.s {
		containers = append(containers, cont)
	}
	c.RUnlock()
	return containers
}

// Compile-time assertion that memoryStore satisfies the Store interface.
var _ Store = &memoryStore{}
package container
import (
"context"
"time"
"github.com/containerd/log"
)
const (
	// loggerCloseTimeout bounds how long Reset waits for the log copier to
	// drain before giving up (logs may be truncated).
	loggerCloseTimeout = 10 * time.Second
)
// Reset puts a container into a state where it can be restarted again.
//
// Streams are closed, stdin is re-created when the container keeps stdin
// open, and the log copier/driver are drained (bounded by
// loggerCloseTimeout) and released.
func (container *Container) Reset(lock bool) {
	if lock {
		container.Lock()
		defer container.Unlock()
	}

	if err := container.CloseStreams(); err != nil {
		log.G(context.TODO()).Errorf("%s: %s", container.ID, err)
	}

	// Re-create a brand new stdin pipe once the container exited
	if container.Config.OpenStdin {
		container.StreamConfig.NewInputPipes()
	}

	if container.LogDriver != nil {
		if container.LogCopier != nil {
			// Wait for the copier in a goroutine so the timeout below can
			// cut the wait short.
			exit := make(chan struct{})
			go func() {
				container.LogCopier.Wait()
				close(exit)
			}()

			timer := time.NewTimer(loggerCloseTimeout)
			defer timer.Stop()
			select {
			case <-timer.C:
				log.G(context.TODO()).Warn("Logger didn't exit in time: logs may be truncated")
			case <-exit:
			}
		}
		container.LogDriver.Close()
		container.LogCopier = nil
		container.LogDriver = nil
	}
}
package container
import (
"context"
"errors"
"fmt"
"sync"
"time"
libcontainerdtypes "github.com/docker/docker/daemon/internal/libcontainerd/types"
"github.com/docker/go-units"
"github.com/moby/moby/api/types/container"
)
// State holds the current container state, and has methods to get and
// set the state. State is embedded in the [Container] struct.
//
// State contains an exported [sync.Mutex] which is used as a global lock
// for both the State and the Container it's embedded in.
type State struct {
	// This Mutex is exported by design and is used as a global lock
	// for both the State and the Container it's embedded in.
	sync.Mutex

	// Note that [State.Running], [State.Restarting], and [State.Paused] are
	// not mutually exclusive.
	//
	// When pausing a container (on Linux), the freezer cgroup is used to suspend
	// all processes in the container. Freezing the process requires the process to
	// be running. As a result, paused containers can have both [State.Running]
	// and [State.Paused] set to true.
	//
	// In a similar fashion, [State.Running] and [State.Restarting] can both
	// be true in a situation where a container is in process of being restarted.
	// Refer to [State.StateString] for order of precedence.
	Running           bool
	Paused            bool
	Restarting        bool
	OOMKilled         bool
	RemovalInProgress bool `json:"-"` // No need for this to be persistent on disk.
	Dead              bool
	Pid               int
	ExitCodeValue     int    `json:"ExitCode"`
	ErrorMsg          string `json:"Error"` // contains last known error during container start, stop, or remove
	StartedAt         time.Time
	FinishedAt        time.Time
	Health            *Health // nil when the container has no health check
	Removed           bool    `json:"-"`

	// Waiter channels, notified when the container stops or is removed.
	stopWaiters       []chan<- container.StateStatus
	removeOnlyWaiters []chan<- container.StateStatus

	// The libcontainerd reference fields are unexported to force consumers
	// to access them through the getter methods with multi-valued returns
	// so that they can't forget to nil-check: the code won't compile unless
	// the nil-check result is explicitly consumed or discarded.
	ctr  libcontainerdtypes.Container
	task libcontainerdtypes.Task
}
// NewState creates a default state object.
func NewState() *State {
	return new(State)
}
// String returns a human-readable description of the state
//
// Precedence while Running: Paused, then Restarting, then health status,
// then plain "Up". When not running: RemovalInProgress, Dead, Created
// (never started), empty string (started but no finish time recorded),
// and finally "Exited".
func (s *State) String() string {
	if s.Running {
		if s.Paused {
			return fmt.Sprintf("Up %s (Paused)", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)))
		}
		if s.Restarting {
			return fmt.Sprintf("Restarting (%d) %s ago", s.ExitCodeValue, units.HumanDuration(time.Now().UTC().Sub(s.FinishedAt)))
		}

		if h := s.Health; h != nil {
			return fmt.Sprintf("Up %s (%s)", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)), h.String())
		}

		return fmt.Sprintf("Up %s", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)))
	}

	if s.RemovalInProgress {
		return "Removal In Progress"
	}

	if s.Dead {
		return "Dead"
	}

	if s.StartedAt.IsZero() {
		return "Created"
	}

	if s.FinishedAt.IsZero() {
		return ""
	}

	return fmt.Sprintf("Exited (%d) %s ago", s.ExitCodeValue, units.HumanDuration(time.Now().UTC().Sub(s.FinishedAt)))
}
// StateString returns the container's current [ContainerState], based on the
// [State.Running], [State.Paused], [State.Restarting], [State.RemovalInProgress],
// [State.StartedAt] and [State.Dead] fields. The order of the checks defines
// the precedence between flags that may be set simultaneously.
func (s *State) StateString() container.ContainerState {
	if s.Running {
		// Paused wins over Restarting when both are set alongside Running.
		if s.Paused {
			return container.StatePaused
		}
		if s.Restarting {
			return container.StateRestarting
		}
		return container.StateRunning
	}
	// TODO(thaJeztah): should [State.Removed] also have an corresponding string?
	// TODO(thaJeztah): should [State.OOMKilled] be taken into account anywhere?
	if s.RemovalInProgress {
		return container.StateRemoving
	}
	if s.Dead {
		return container.StateDead
	}
	// A zero StartedAt means the container was never started.
	if s.StartedAt.IsZero() {
		return container.StateCreated
	}
	return container.StateExited
}
// Wait waits until the container is in a certain state indicated by the given
// condition. A context must be used for cancelling the request, controlling
// timeouts, and avoiding goroutine leaks. Wait must be called without holding
// the state lock. Returns a channel from which the caller will receive the
// result. If the container exited on its own, the result's Err() method will
// be nil and its ExitCode() method will return the container's exit code,
// otherwise, the results Err() method will return an error indicating why the
// wait operation failed.
func (s *State) Wait(ctx context.Context, condition container.WaitCondition) <-chan container.StateStatus {
	s.Lock()
	defer s.Unlock()
	// Buffer so we can put the status in the channel and finish even if
	// nobody ever receives it.
	resultC := make(chan container.StateStatus, 1)
	if s.conditionAlreadyMet(condition) {
		// Fast path: the condition already holds, so answer immediately
		// without registering a waiter.
		resultC <- container.NewStateStatus(s.ExitCode(), s.Err())
		return resultC
	}
	waitC := make(chan container.StateStatus, 1)
	// Removal wakes up both removeOnlyWaiters and stopWaiters.
	// A container could be removed while still in "created" state,
	// in which case it is never actually stopped.
	if condition == container.WaitConditionRemoved {
		s.removeOnlyWaiters = append(s.removeOnlyWaiters, waitC)
	} else {
		s.stopWaiters = append(s.stopWaiters, waitC)
	}
	// Bridge goroutine: forwards either the state notification or the
	// context error to the caller. It exits once ctx is cancelled or a
	// notification arrives, whichever comes first.
	go func() {
		select {
		case <-ctx.Done():
			// Context timeout or cancellation.
			resultC <- container.NewStateStatus(-1, ctx.Err())
			return
		case status := <-waitC:
			resultC <- status
		}
	}()
	return resultC
}
// conditionAlreadyMet reports whether the given wait condition is already
// satisfied by the current state, so Wait can answer without registering a
// waiter. The caller must hold the state lock.
func (s *State) conditionAlreadyMet(condition container.WaitCondition) bool {
	switch condition {
	case container.WaitConditionNotRunning:
		return !s.Running
	case container.WaitConditionRemoved:
		return s.Removed
	default:
		// Any other condition (e.g. "next exit") always requires waiting.
		// TODO(thaJeztah): how do we want to handle "WaitConditionNextExit"?
		return false
	}
}
// IsRunning returns whether the [State.Running] flag is set, taking the
// state lock for the read.
//
// Note that [State.Running], [State.Restarting], and [State.Paused] are
// not mutually exclusive: a paused container keeps Running set (on Linux
// the freezer cgroup suspends processes that must be running), and a
// container in the process of restarting can have both Running and
// Restarting set. Refer to [State.StateString] for order of precedence.
func (s *State) IsRunning() bool {
	s.Lock()
	running := s.Running
	s.Unlock()
	return running
}
// GetPID returns the process id of the container's main process, taking
// the state lock for the read. A value of 0 means no running process.
func (s *State) GetPID() int {
	s.Lock()
	pid := s.Pid
	s.Unlock()
	return pid
}
// ExitCode returns current exitcode for the state. Take lock before if state
// may be shared; this accessor itself does not lock.
func (s *State) ExitCode() int {
	return s.ExitCodeValue
}
// SetExitCode sets current exitcode for the state. Take lock before if state
// may be shared; this setter itself does not lock.
func (s *State) SetExitCode(ec int) {
	s.ExitCodeValue = ec
}
// SetRunning sets the running state along with StartedAt time. It delegates
// to setRunning with a non-nil start time so StartedAt is recorded.
func (s *State) SetRunning(ctr libcontainerdtypes.Container, tsk libcontainerdtypes.Task, start time.Time) {
	s.setRunning(ctr, tsk, &start)
}
// SetRunningExternal sets the running state without setting the `StartedAt` time (used for containers not started by Docker instead of SetRunning).
// A nil start time tells setRunning to leave StartedAt unchanged.
func (s *State) SetRunningExternal(ctr libcontainerdtypes.Container, tsk libcontainerdtypes.Task) {
	s.setRunning(ctr, tsk, nil)
}
// setRunning sets the state of the container to "running": it clears any
// previous error message, exit code, and OOM flag, stores the libcontainerd
// container/task references (consumed via C8dContainer / Task), and records
// the PID from the task when one is present (0 otherwise). When start is
// non-nil it is stored, normalized to UTC, as the StartedAt time; a nil
// start leaves StartedAt untouched (see SetRunningExternal).
func (s *State) setRunning(ctr libcontainerdtypes.Container, tsk libcontainerdtypes.Task, start *time.Time) {
	s.ErrorMsg = ""
	s.Paused = false
	s.Running = true
	s.Restarting = false
	s.OOMKilled = false
	s.ExitCodeValue = 0
	s.ctr = ctr
	s.task = tsk
	if tsk != nil {
		s.Pid = int(tsk.Pid())
	} else {
		s.Pid = 0
	}
	// Note: a previous version re-cleared s.Paused inside an extra
	// `start != nil` branch; that was redundant since Paused is
	// unconditionally cleared above, so the two branches are merged.
	if start != nil {
		s.StartedAt = start.UTC()
	}
}
// SetStopped sets the container state to "stopped" without locking.
// It clears the running/paused/restarting flags and the PID, records the
// finish time and exit code, and wakes up everything waiting on a stop.
func (s *State) SetStopped(exitStatus *ExitStatus) {
	s.Running = false
	s.Paused = false
	s.Restarting = false
	s.Pid = 0
	// Fall back to "now" when the exit status carries no timestamp.
	if exitStatus.ExitedAt.IsZero() {
		s.FinishedAt = time.Now().UTC()
	} else {
		s.FinishedAt = exitStatus.ExitedAt
	}
	s.ExitCodeValue = exitStatus.ExitCode
	// Wake all stop waiters with the final status.
	s.notifyAndClear(&s.stopWaiters)
}
// SetRestarting sets the container state to "restarting" without locking.
// It also sets the container PID to 0.
func (s *State) SetRestarting(exitStatus *ExitStatus) {
	// we should consider the container running when it is restarting because of
	// all the checks in docker around rm/stop/etc
	s.Running = true
	s.Restarting = true
	s.Paused = false
	s.Pid = 0
	s.FinishedAt = time.Now().UTC()
	s.ExitCodeValue = exitStatus.ExitCode
	// The previous process has exited, so stop waiters are notified even
	// though the container is logically still "running" (restarting).
	s.notifyAndClear(&s.stopWaiters)
}
// SetError records the container's last error message. Passing a nil error
// clears any previously stored message. This is useful when we want to know
// the error that occurred when the container transitions to another state
// when inspecting it.
func (s *State) SetError(err error) {
	if err == nil {
		s.ErrorMsg = ""
		return
	}
	s.ErrorMsg = err.Error()
}
// IsPaused returns whether the container is paused, taking the state lock
// for the read.
//
// Note that [State.Running], [State.Restarting], and [State.Paused] are
// not mutually exclusive: pausing (on Linux) freezes running processes via
// the freezer cgroup, so a paused container also has Running set, and
// Running and Restarting can both be true while a restart is in progress.
// Refer to [State.StateString] for order of precedence.
func (s *State) IsPaused() bool {
	s.Lock()
	paused := s.Paused
	s.Unlock()
	return paused
}
// IsRestarting returns whether the container is restarting, taking the
// state lock for the read.
//
// Note that [State.Running], [State.Restarting], and [State.Paused] are
// not mutually exclusive: a restarting container keeps Running set, and a
// paused container (frozen via the freezer cgroup on Linux) does too.
// Refer to [State.StateString] for order of precedence.
func (s *State) IsRestarting() bool {
	s.Lock()
	restarting := s.Restarting
	s.Unlock()
	return restarting
}
// SetRemovalInProgress atomically marks the container as being removed.
// It returns true if the container was already in that state, in which
// case the flag is left untouched.
func (s *State) SetRemovalInProgress() bool {
	s.Lock()
	defer s.Unlock()
	if !s.RemovalInProgress {
		s.RemovalInProgress = true
		return false
	}
	return true
}
// ResetRemovalInProgress clears the RemovalInProgress flag under the
// state lock, allowing a failed removal to be retried.
func (s *State) ResetRemovalInProgress() {
	s.Lock()
	defer s.Unlock()
	s.RemovalInProgress = false
}
// IsRemovalInProgress returns whether the RemovalInProgress flag is set,
// taking the state lock for the read. Used by Container to check whether
// a container is being removed.
func (s *State) IsRemovalInProgress() bool {
	s.Lock()
	inProgress := s.RemovalInProgress
	s.Unlock()
	return inProgress
}
// IsDead returns whether the Dead flag is set, taking the state lock for
// the read. Used by Container to check whether a container is dead.
func (s *State) IsDead() bool {
	s.Lock()
	dead := s.Dead
	s.Unlock()
	return dead
}
// SetRemoved assumes this container is already in the "dead" state and notifies all waiters.
// It is equivalent to a successful removal, i.e. SetRemovalError with a nil error.
func (s *State) SetRemoved() {
	s.SetRemovalError(nil)
}
// SetRemovalError is to be called in case a container remove failed.
// It sets an error and notifies all waiters (both removal-only and stop
// waiters), marking the container as removed.
func (s *State) SetRemovalError(err error) {
	// NOTE(review): SetError is called before the lock is taken here;
	// confirm callers do not race on ErrorMsg.
	s.SetError(err)
	s.Lock()
	s.Removed = true
	s.notifyAndClear(&s.removeOnlyWaiters)
	s.notifyAndClear(&s.stopWaiters)
	s.Unlock()
}
// Err returns the last recorded error as an error value, or nil when no
// error message is stored.
func (s *State) Err() error {
	if s.ErrorMsg == "" {
		return nil
	}
	return errors.New(s.ErrorMsg)
}
// notifyAndClear sends the current state status to every waiter in the
// given list, then empties the list. The sends do not block because each
// waiter channel is created with a buffer of one (see Wait). The caller
// must hold the state lock.
func (s *State) notifyAndClear(waiters *[]chan<- container.StateStatus) {
	result := container.NewStateStatus(s.ExitCodeValue, s.Err())
	for _, c := range *waiters {
		c <- result
	}
	*waiters = nil
}
// C8dContainer returns a reference to the libcontainerd Container object for
// the container and whether the reference is valid.
//
// The container lock must be held when calling this method.
func (s *State) C8dContainer() (_ libcontainerdtypes.Container, ok bool) {
	// ok is false while the container has no libcontainerd counterpart.
	return s.ctr, s.ctr != nil
}
// Task returns a reference to the libcontainerd Task object for the container
// and whether the reference is valid.
//
// The container lock must be held when calling this method.
//
// See also: (*Container).GetRunningTask().
func (s *State) Task() (_ libcontainerdtypes.Task, ok bool) {
	// ok is false while no task is associated (container not running).
	return s.task, s.task != nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package container
import (
"bytes"
"context"
"errors"
"fmt"
"maps"
"strings"
"time"
"github.com/containerd/log"
"github.com/docker/docker/errdefs"
"github.com/docker/go-connections/nat"
memdb "github.com/hashicorp/go-memdb"
"github.com/moby/moby/api/types/container"
"github.com/moby/moby/api/types/network"
)
// Table and index names used by the memdb schema below.
const (
	memdbContainersTable  = "containers"
	memdbNamesTable       = "names"
	memdbIDIndex          = "id"
	memdbIDIndexPrefix    = "id_prefix" // prefix variant of the id index, used by GetByPrefix
	memdbContainerIDIndex = "containerid"
)
// Snapshot is a read only view for Containers. It holds all information necessary to serve container queries in a
// versioned ACID in-memory store.
type Snapshot struct {
	container.Summary
	// additional info queries need to filter on
	// preserve nanosec resolution for queries
	CreatedAt    time.Time
	StartedAt    time.Time
	Name         string
	Pid          int
	ExitCode     int
	Running      bool
	Paused       bool
	Managed      bool
	ExposedPorts nat.PortSet
	PortBindings nat.PortSet
	Health       container.HealthStatus
	// HostConfig holds the subset of host configuration needed for filtering.
	HostConfig struct {
		Isolation string
	}
}
// nameAssociation associates a container id with a name.
type nameAssociation struct {
	// name is the name to associate. Note that name is the primary key
	// ("id" in memdb).
	name string
	// containerID is the container the name is reserved for.
	containerID string
}
// schema describes the two memdb tables backing ViewDB: "containers",
// keyed by container ID, and "names", keyed by reserved name with a
// secondary index on the owning container ID.
var schema = &memdb.DBSchema{
	Tables: map[string]*memdb.TableSchema{
		memdbContainersTable: {
			Name: memdbContainersTable,
			Indexes: map[string]*memdb.IndexSchema{
				memdbIDIndex: {
					Name:    memdbIDIndex,
					Unique:  true,
					Indexer: &containerByIDIndexer{},
				},
			},
		},
		memdbNamesTable: {
			Name: memdbNamesTable,
			Indexes: map[string]*memdb.IndexSchema{
				// Used for names, because "id" is the primary key in memdb.
				memdbIDIndex: {
					Name:    memdbIDIndex,
					Unique:  true,
					Indexer: &namesByNameIndexer{},
				},
				// Secondary (non-unique) index: one container may hold several names.
				memdbContainerIDIndex: {
					Name:    memdbContainerIDIndex,
					Indexer: &namesByContainerIDIndexer{},
				},
			},
		},
	},
}
// ViewDB provides an in-memory transactional (ACID) container store.
type ViewDB struct {
	// store is the underlying go-memdb database built from schema.
	store *memdb.MemDB
}
// NewViewDB provides the default implementation, with the default schema.
// A schema error is wrapped as an errdefs.System error.
func NewViewDB() (*ViewDB, error) {
	store, err := memdb.NewMemDB(schema)
	if err != nil {
		return nil, errdefs.System(err)
	}
	return &ViewDB{store: store}, nil
}
// GetByPrefix returns a container with the given ID prefix. It returns an
// error if an empty prefix was given or if multiple containers match the prefix.
// It returns an [errdefs.NotFound] if the given s yielded no results.
func (db *ViewDB) GetByPrefix(s string) (string, error) {
	if s == "" {
		return "", errdefs.InvalidParameter(errors.New("prefix can't be empty"))
	}
	iter, err := db.store.Txn(false).Get(memdbContainersTable, memdbIDIndexPrefix, s)
	if err != nil {
		return "", errdefs.System(err)
	}
	// Walk the prefix matches; more than one distinct match is ambiguous.
	var match string
	for item := iter.Next(); item != nil; item = iter.Next() {
		if match != "" {
			return "", errdefs.InvalidParameter(errors.New("multiple IDs found with provided prefix: " + s))
		}
		match = item.(*Container).ID
	}
	if match == "" {
		return "", errdefs.NotFound(errors.New("No such container: " + s))
	}
	return match, nil
}
// Snapshot provides a consistent read-only view of the database.
// The returned View wraps a read-only memdb transaction.
func (db *ViewDB) Snapshot() *View {
	return &View{
		txn: db.store.Txn(false),
	}
}
// withTxn runs cb inside a write transaction, committing on success and
// aborting (rolling back) when cb returns an error.
func (db *ViewDB) withTxn(cb func(*memdb.Txn) error) error {
	txn := db.store.Txn(true)
	if err := cb(txn); err != nil {
		txn.Abort()
		return err
	}
	txn.Commit()
	return nil
}
// Save atomically updates the in-memory store state for a Container.
// Only read only (deep) copies of containers may be passed in.
func (db *ViewDB) Save(c *Container) error {
	return db.withTxn(func(txn *memdb.Txn) error {
		// memdb Insert is an upsert: it replaces any existing entry with the same ID.
		return txn.Insert(memdbContainersTable, c)
	})
}
// Delete removes an item by ID, together with any names reserved for it.
func (db *ViewDB) Delete(c *Container) error {
	return db.withTxn(func(txn *memdb.Txn) error {
		view := &View{txn: txn}
		names := view.getNames(c.ID)
		// Remove every name association pointing at this container.
		for _, name := range names {
			txn.Delete(memdbNamesTable, nameAssociation{name: name})
		}
		// Ignore error - the container may not actually exist in the
		// db, but we still need to clean up associated names.
		txn.Delete(memdbContainersTable, NewBaseContainer(c.ID, c.Root))
		return nil
	})
}
// ReserveName registers a container ID to a name. ReserveName is idempotent,
// but returns an [errdefs.Conflict] when attempting to reserve a container ID
// to a name that already is reserved.
func (db *ViewDB) ReserveName(name, containerID string) error {
	return db.withTxn(func(txn *memdb.Txn) error {
		s, err := txn.First(memdbNamesTable, memdbIDIndex, name)
		if err != nil {
			return errdefs.System(err)
		}
		if s != nil {
			// Name already reserved: OK if for the same container (idempotent),
			// conflict otherwise.
			if s.(nameAssociation).containerID != containerID {
				return errdefs.Conflict(errors.New("name is reserved"))
			}
			return nil
		}
		return txn.Insert(memdbNamesTable, nameAssociation{name: name, containerID: containerID})
	})
}
// ReleaseName releases the reserved name.
// Once released, a name can be reserved again.
func (db *ViewDB) ReleaseName(name string) error {
	return db.withTxn(func(txn *memdb.Txn) error {
		// Only name is needed for deletion: it is the table's primary key.
		return txn.Delete(memdbNamesTable, nameAssociation{name: name})
	})
}
// View provides a consistent read-only view of the database.
type View struct {
	// txn is a read-only memdb transaction pinned to one version of the store.
	txn *memdb.Txn
}
// All returns all items in this snapshot as read-only Snapshot values.
// Returned objects must never be modified.
func (v *View) All() ([]Snapshot, error) {
	iter, err := v.txn.Get(memdbContainersTable, memdbIDIndex)
	if err != nil {
		return nil, errdefs.System(err)
	}
	var result []Snapshot
	for item := iter.Next(); item != nil; item = iter.Next() {
		result = append(result, *v.transform(item.(*Container)))
	}
	return result, nil
}
// Get returns an item by id. Returned objects must never be modified.
// It returns an [errdefs.NotFound] if the given id was not found.
func (v *View) Get(id string) (*Snapshot, error) {
	item, err := v.txn.First(memdbContainersTable, memdbIDIndex, id)
	if err != nil {
		return nil, errdefs.System(err)
	}
	if item == nil {
		return nil, errdefs.NotFound(errors.New("No such container: " + id))
	}
	return v.transform(item.(*Container)), nil
}
// getNames lists all the reserved names for the given container ID, using
// the secondary container-ID index. Lookup errors yield a nil slice.
func (v *View) getNames(containerID string) []string {
	iter, err := v.txn.Get(memdbNamesTable, memdbContainerIDIndex, containerID)
	if err != nil {
		return nil
	}
	var reserved []string
	for item := iter.Next(); item != nil; item = iter.Next() {
		reserved = append(reserved, item.(nameAssociation).name)
	}
	return reserved
}
// GetID returns the container ID that the passed in name is reserved to.
// It returns an [errdefs.NotFound] if the given id was not found.
func (v *View) GetID(name string) (string, error) {
	s, err := v.txn.First(memdbNamesTable, memdbIDIndex, name)
	if err != nil {
		return "", errdefs.System(err)
	}
	if s == nil {
		return "", errdefs.NotFound(errors.New("name is not reserved"))
	}
	return s.(nameAssociation).containerID, nil
}
// GetAllNames returns all registered names, as a map from container ID to
// the names reserved for that container. Lookup errors yield nil.
func (v *View) GetAllNames() map[string][]string {
	// Iterating the container-ID index with no argument walks every entry.
	iter, err := v.txn.Get(memdbNamesTable, memdbContainerIDIndex)
	if err != nil {
		return nil
	}
	out := make(map[string][]string)
	for {
		item := iter.Next()
		if item == nil {
			break
		}
		assoc := item.(nameAssociation)
		out[assoc.containerID] = append(out[assoc.containerID], assoc.name)
	}
	return out
}
// transform maps a (deep) copied Container object to what queries need.
// A lock on the Container is not held because these are immutable deep copies.
func (v *View) transform(ctr *Container) *Snapshot {
	// Health defaults to "no healthcheck" when none is configured.
	health := container.NoHealthcheck
	failingStreak := 0
	if ctr.Health != nil {
		health = ctr.Health.Status()
		failingStreak = ctr.Health.FailingStreak
	}
	healthSummary := &container.HealthSummary{
		Status:        health,
		FailingStreak: failingStreak,
	}
	snapshot := &Snapshot{
		Summary: container.Summary{
			ID:      ctr.ID,
			Names:   v.getNames(ctr.ID),
			ImageID: ctr.ImageID.String(),
			Ports:   []container.Port{},
			Mounts:  ctr.GetMountPoints(),
			State:   ctr.State.StateString(),
			Status:  ctr.State.String(),
			Health:  healthSummary,
			Created: ctr.Created.Unix(),
		},
		CreatedAt:    ctr.Created,
		StartedAt:    ctr.StartedAt,
		Name:         ctr.Name,
		Pid:          ctr.Pid,
		Managed:      ctr.Managed,
		ExposedPorts: make(nat.PortSet),
		PortBindings: make(nat.PortSet),
		Health:       health,
		Running:      ctr.Running,
		Paused:       ctr.Paused,
		ExitCode:     ctr.ExitCode(),
	}
	if snapshot.Names == nil {
		// Dead containers will often have no name, so make sure the response isn't null
		snapshot.Names = []string{}
	}
	// Copy the relevant host-config fields (network mode, annotations,
	// isolation, and the set of bound ports).
	if ctr.HostConfig != nil {
		snapshot.Summary.HostConfig.NetworkMode = string(ctr.HostConfig.NetworkMode)
		snapshot.Summary.HostConfig.Annotations = maps.Clone(ctr.HostConfig.Annotations)
		snapshot.HostConfig.Isolation = string(ctr.HostConfig.Isolation)
		for binding := range ctr.HostConfig.PortBindings {
			snapshot.PortBindings[binding] = struct{}{}
		}
	}
	if ctr.Config != nil {
		snapshot.Image = ctr.Config.Image
		snapshot.Labels = ctr.Config.Labels
		for exposed := range ctr.Config.ExposedPorts {
			snapshot.ExposedPorts[exposed] = struct{}{}
		}
	}
	// Build the display command: path plus args, quoting args that contain spaces.
	if len(ctr.Args) > 0 {
		var args []string
		for _, arg := range ctr.Args {
			if strings.Contains(arg, " ") {
				args = append(args, fmt.Sprintf("'%s'", arg))
			} else {
				args = append(args, arg)
			}
		}
		argsAsString := strings.Join(args, " ")
		snapshot.Command = fmt.Sprintf("%s %s", ctr.Path, argsAsString)
	} else {
		snapshot.Command = ctr.Path
	}
	snapshot.Ports = []container.Port{}
	networks := make(map[string]*network.EndpointSettings)
	if ctr.NetworkSettings != nil {
		// Copy per-network endpoint settings into API types.
		for name, netw := range ctr.NetworkSettings.Networks {
			if netw == nil || netw.EndpointSettings == nil {
				continue
			}
			networks[name] = &network.EndpointSettings{
				EndpointID:          netw.EndpointID,
				Gateway:             netw.Gateway,
				IPAddress:           netw.IPAddress,
				IPPrefixLen:         netw.IPPrefixLen,
				IPv6Gateway:         netw.IPv6Gateway,
				GlobalIPv6Address:   netw.GlobalIPv6Address,
				GlobalIPv6PrefixLen: netw.GlobalIPv6PrefixLen,
				MacAddress:          netw.MacAddress,
				NetworkID:           netw.NetworkID,
				GwPriority:          netw.GwPriority,
			}
			if netw.IPAMConfig != nil {
				networks[name].IPAMConfig = &network.EndpointIPAMConfig{
					IPv4Address: netw.IPAMConfig.IPv4Address,
					IPv6Address: netw.IPAMConfig.IPv6Address,
				}
			}
		}
		// Flatten the port map: exposed-only ports get a PrivatePort entry,
		// bound ports get one entry per host binding. Unparseable ports are
		// logged and skipped rather than failing the whole snapshot.
		for port, bindings := range ctr.NetworkSettings.Ports {
			p, err := nat.ParsePort(port.Port())
			if err != nil {
				log.G(context.TODO()).WithError(err).Warn("invalid port map")
				continue
			}
			if len(bindings) == 0 {
				snapshot.Ports = append(snapshot.Ports, container.Port{
					PrivatePort: uint16(p),
					Type:        port.Proto(),
				})
				continue
			}
			for _, binding := range bindings {
				h, err := nat.ParsePort(binding.HostPort)
				if err != nil {
					log.G(context.TODO()).WithError(err).Warn("invalid host port map")
					continue
				}
				snapshot.Ports = append(snapshot.Ports, container.Port{
					PrivatePort: uint16(p),
					PublicPort:  uint16(h),
					Type:        port.Proto(),
					IP:          binding.HostIP,
				})
			}
		}
	}
	snapshot.NetworkSettings = &container.NetworkSettingsSummary{Networks: networks}
	if ctr.ImageManifest != nil {
		// Copy the manifest descriptor, filling in the platform from the
		// container when the manifest itself carries none.
		imageManifest := *ctr.ImageManifest
		if imageManifest.Platform == nil {
			imageManifest.Platform = &ctr.ImagePlatform
		}
		snapshot.Summary.ImageManifestDescriptor = &imageManifest
	}
	return snapshot
}
// containerByIDIndexer is used to extract the ID field from Container types.
// memdb.StringFieldIndex can not be used since ID is a field from an embedded struct.
type containerByIDIndexer struct{}

// terminator is the null character, used as a terminator.
const terminator = "\x00"

// FromObject implements the memdb.SingleIndexer interface for Container objects.
// It returns the index key for a stored container: its ID plus a terminator.
func (e *containerByIDIndexer) FromObject(obj any) (bool, []byte, error) {
	c, ok := obj.(*Container)
	if !ok {
		return false, nil, fmt.Errorf("%T is not a Container", obj)
	}
	// Add the null character as a terminator
	return true, []byte(c.ID + terminator), nil
}

// FromArgs implements the memdb.Indexer interface.
// It builds a lookup key from a single string argument.
func (e *containerByIDIndexer) FromArgs(args ...any) ([]byte, error) {
	if len(args) != 1 {
		return nil, errors.New("must provide only a single argument")
	}
	arg, ok := args[0].(string)
	if !ok {
		return nil, fmt.Errorf("argument must be a string: %#v", args[0])
	}
	// Add the null character as a terminator
	return []byte(arg + terminator), nil
}

// PrefixFromArgs builds a prefix-scan key: the normal key with the
// terminator stripped, so partially-matching IDs are found.
func (e *containerByIDIndexer) PrefixFromArgs(args ...any) ([]byte, error) {
	val, err := e.FromArgs(args...)
	if err != nil {
		return nil, err
	}
	// Strip the null terminator, the rest is a prefix
	return bytes.TrimSuffix(val, []byte(terminator)), nil
}
// namesByNameIndexer is used to index container name associations by name.
type namesByNameIndexer struct{}

// FromObject returns the index key for a stored nameAssociation: its name
// plus a null terminator.
func (e *namesByNameIndexer) FromObject(obj any) (bool, []byte, error) {
	n, ok := obj.(nameAssociation)
	if !ok {
		return false, nil, fmt.Errorf(`%T does not have type "nameAssociation"`, obj)
	}
	// Add the null character as a terminator
	return true, []byte(n.name + terminator), nil
}

// FromArgs builds a lookup key from a single string (name) argument.
func (e *namesByNameIndexer) FromArgs(args ...any) ([]byte, error) {
	if len(args) != 1 {
		return nil, errors.New("must provide only a single argument")
	}
	arg, ok := args[0].(string)
	if !ok {
		return nil, fmt.Errorf("argument must be a string: %#v", args[0])
	}
	// Add the null character as a terminator
	return []byte(arg + terminator), nil
}
// namesByContainerIDIndexer is used to index container names by container ID.
type namesByContainerIDIndexer struct{}

// FromObject returns the index key for a stored nameAssociation: the owning
// container's ID plus a null terminator.
func (e *namesByContainerIDIndexer) FromObject(obj any) (bool, []byte, error) {
	n, ok := obj.(nameAssociation)
	if !ok {
		return false, nil, fmt.Errorf(`%T does not have type "nameAssociation"`, obj)
	}
	// Add the null character as a terminator
	return true, []byte(n.containerID + terminator), nil
}

// FromArgs builds a lookup key from a single string (container ID) argument.
func (e *namesByContainerIDIndexer) FromArgs(args ...any) ([]byte, error) {
	if len(args) != 1 {
		return nil, errors.New("must provide only a single argument")
	}
	arg, ok := args[0].(string)
	if !ok {
		return nil, fmt.Errorf("argument must be a string: %#v", args[0])
	}
	// Add the null character as a terminator
	return []byte(arg + terminator), nil
}
package events
import (
"sync"
"time"
"github.com/docker/docker/daemon/internal/metrics"
eventtypes "github.com/moby/moby/api/types/events"
"github.com/moby/pubsub"
)
const (
	// eventsLimit is the maximum number of events retained in the in-memory buffer.
	eventsLimit = 256
	// bufferSize is the per-subscriber pubsub channel buffer.
	bufferSize = 1024
)
// Events is pubsub channel for events generated by the engine.
type Events struct {
	mu     sync.Mutex           // guards events
	events []eventtypes.Message // ring buffer of the last eventsLimit events
	pub    *pubsub.Publisher    // fan-out to subscribers
}
// New returns new *Events instance with an empty buffer (capacity
// eventsLimit) and a publisher that gives each subscriber 100ms to receive.
func New() *Events {
	return &Events{
		events: make([]eventtypes.Message, 0, eventsLimit),
		pub:    pubsub.NewPublisher(100*time.Millisecond, bufferSize),
	}
}
// Subscribe adds new listener to events, returns slice of 256 stored
// last events, a channel in which you can expect new events (in form
// of interface{}, so you need type assertion), and a function to call
// to stop the stream of events.
func (e *Events) Subscribe() ([]eventtypes.Message, chan interface{}, func()) {
	metrics.EventSubscribers.Inc()
	e.mu.Lock()
	// Snapshot the buffered events so the caller can't observe later mutations.
	snapshot := append([]eventtypes.Message(nil), e.events...)
	ch := e.pub.Subscribe()
	e.mu.Unlock()
	return snapshot, ch, func() { e.Evict(ch) }
}
// SubscribeTopic adds new listener to events, returns slice of 256 stored
// last events, a channel in which you can expect new events (in form
// of interface{}, so you need type assertion).
func (e *Events) SubscribeTopic(since, until time.Time, ef *Filter) ([]eventtypes.Message, chan interface{}) {
	metrics.EventSubscribers.Inc()
	e.mu.Lock()
	// Build a topic predicate from the filter, if any filters were given.
	var topic func(m interface{}) bool
	if ef != nil && ef.filter.Len() > 0 {
		topic = func(m interface{}) bool { return ef.Include(m.(eventtypes.Message)) }
	}
	// Replay buffered events matching the time window (and topic) first.
	buffered := e.loadBufferedEvents(since, until, topic)
	var ch chan interface{}
	if topic != nil {
		ch = e.pub.SubscribeTopic(topic)
	} else {
		// Subscribe to all events if there are no filters
		ch = e.pub.Subscribe()
	}
	e.mu.Unlock()
	return buffered, ch
}
// Evict evicts listener from pubsub and decrements the subscriber metric.
func (e *Events) Evict(l chan interface{}) {
	metrics.EventSubscribers.Dec()
	e.pub.Evict(l)
}
// Log creates a local scope message and publishes it, stamping it with the
// current time (both second and nanosecond resolution).
func (e *Events) Log(action eventtypes.Action, eventType eventtypes.Type, actor eventtypes.Actor) {
	now := time.Now().UTC()
	jm := eventtypes.Message{
		Action:   action,
		Type:     eventType,
		Actor:    actor,
		Scope:    "local",
		Time:     now.Unix(),
		TimeNano: now.UnixNano(),
	}
	// fill deprecated fields for container and images
	switch eventType {
	case eventtypes.ContainerEventType:
		jm.ID = actor.ID           //nolint:staticcheck // ignore SA1019: field is deprecated but set for backward compatibility.
		jm.Status = string(action) //nolint:staticcheck // ignore SA1019: field is deprecated but set for backward compatibility.
		jm.From = actor.Attributes["image"] //nolint:staticcheck // ignore SA1019: field is deprecated but set for backward compatibility.
	case eventtypes.ImageEventType:
		jm.ID = actor.ID           //nolint:staticcheck // ignore SA1019: field is deprecated but set for backward compatibility.
		jm.Status = string(action) //nolint:staticcheck // ignore SA1019: field is deprecated but set for backward compatibility.
	default:
		// TODO(thaJeztah): make switch exhaustive
	}
	e.PublishMessage(jm)
}
// PublishMessage broadcasts event to listeners. Each listener has 100 milliseconds to
// receive the event or it will be skipped.
func (e *Events) PublishMessage(jm eventtypes.Message) {
	metrics.EventsCounter.Inc()
	e.mu.Lock()
	if len(e.events) == cap(e.events) {
		// Buffer full: discard the oldest event by shifting everything
		// left one slot, then place the new event at the end.
		copy(e.events, e.events[1:])
		e.events[len(e.events)-1] = jm
	} else {
		e.events = append(e.events, jm)
	}
	e.mu.Unlock()
	// Publish outside the lock; the publisher handles its own synchronization.
	e.pub.Publish(jm)
}
// SubscribersCount returns number of event listeners currently subscribed.
func (e *Events) SubscribersCount() int {
	return e.pub.Len()
}
// loadBufferedEvents iterates over the cached events in the buffer
// and returns those that were emitted between two specific dates.
// It uses `time.Unix(seconds, nanoseconds)` to generate valid dates with those arguments.
// It filters those buffered messages with a topic function if it's not nil, otherwise it adds all messages.
//
// The caller must hold e.mu (see SubscribeTopic). When both since and until
// are zero, no replay is requested and nil is returned.
func (e *Events) loadBufferedEvents(since, until time.Time, topic func(interface{}) bool) []eventtypes.Message {
	var buffered []eventtypes.Message
	if since.IsZero() && until.IsZero() {
		return buffered
	}
	var sinceNanoUnix int64
	if !since.IsZero() {
		sinceNanoUnix = since.UnixNano()
	}
	var untilNanoUnix int64
	if !until.IsZero() {
		untilNanoUnix = until.UnixNano()
	}
	// Walk the buffer newest-to-oldest; the break relies on events being
	// stored in chronological order, so everything before the first
	// too-old event can be skipped.
	for i := len(e.events) - 1; i >= 0; i-- {
		ev := e.events[i]
		if ev.TimeNano < sinceNanoUnix {
			break
		}
		if untilNanoUnix > 0 && ev.TimeNano > untilNanoUnix {
			continue
		}
		if topic == nil || topic(ev) {
			// Prepend to keep the returned slice in chronological order.
			buffered = append([]eventtypes.Message{ev}, buffered...)
		}
	}
	return buffered
}
package events
import (
"github.com/distribution/reference"
"github.com/moby/moby/api/types/events"
"github.com/moby/moby/api/types/filters"
)
// Filter can filter out docker events from a stream.
type Filter struct {
	// filter holds the user-supplied filter arguments to match events against.
	filter filters.Args
}
// NewFilter creates a new Filter wrapping the given filter arguments.
func NewFilter(filter filters.Args) *Filter {
	return &Filter{filter: filter}
}
// Include returns true when the event ev is included by the filters.
// All filter dimensions must match (logical AND across them).
func (ef *Filter) Include(ev events.Message) bool {
	return ef.matchEvent(ev) &&
		ef.filter.ExactMatch("type", string(ev.Type)) &&
		ef.matchScope(ev.Scope) &&
		ef.matchDaemon(ev) &&
		ef.matchContainer(ev) &&
		ef.matchPlugin(ev) &&
		ef.matchVolume(ev) &&
		ef.matchNetwork(ev) &&
		ef.matchImage(ev) &&
		ef.matchNode(ev) &&
		ef.matchService(ev) &&
		ef.matchSecret(ev) &&
		ef.matchConfig(ev) &&
		ef.matchLabels(ev.Actor.Attributes)
}
// matchEvent reports whether ev's action passes the "event" filter.
// #25798 if an event filter contains either health_status, exec_create or exec_start without a colon
// let's do a FuzzyMatch instead of an ExactMatch, since those actions carry a
// suffix (e.g. "health_status: healthy") after the bare name.
func (ef *Filter) matchEvent(ev events.Message) bool {
	if ef.filterContains("event", map[string]struct{}{"health_status": {}, "exec_create": {}, "exec_start": {}}) {
		return ef.filter.FuzzyMatch("event", string(ev.Action))
	}
	return ef.filter.ExactMatch("event", string(ev.Action))
}
// filterContains reports whether any of the filter's values for the given
// field appears in the provided set.
func (ef *Filter) filterContains(field string, values map[string]struct{}) bool {
	for _, candidate := range ef.filter.Get(field) {
		if _, found := values[candidate]; found {
			return true
		}
	}
	return false
}
// matchScope reports whether scope passes the "scope" filter; an absent
// filter matches everything.
func (ef *Filter) matchScope(scope string) bool {
	if !ef.filter.Contains("scope") {
		return true
	}
	return ef.filter.ExactMatch("scope", scope)
}
// matchLabels reports whether the actor's attributes satisfy the "label"
// filter; an absent filter matches everything.
func (ef *Filter) matchLabels(attributes map[string]string) bool {
	if !ef.filter.Contains("label") {
		return true
	}
	return ef.filter.MatchKVList("label", attributes)
}
// matchDaemon reports whether ev passes the "daemon" name/ID filter.
func (ef *Filter) matchDaemon(ev events.Message) bool {
	return ef.fuzzyMatchName(ev, events.DaemonEventType)
}
// matchContainer reports whether ev passes the "container" name/ID filter.
func (ef *Filter) matchContainer(ev events.Message) bool {
	return ef.fuzzyMatchName(ev, events.ContainerEventType)
}
// matchPlugin reports whether ev passes the "plugin" name/ID filter.
func (ef *Filter) matchPlugin(ev events.Message) bool {
	return ef.fuzzyMatchName(ev, events.PluginEventType)
}
// matchVolume reports whether ev passes the "volume" name/ID filter.
func (ef *Filter) matchVolume(ev events.Message) bool {
	return ef.fuzzyMatchName(ev, events.VolumeEventType)
}
// matchNetwork reports whether ev passes the "network" name/ID filter.
func (ef *Filter) matchNetwork(ev events.Message) bool {
	return ef.fuzzyMatchName(ev, events.NetworkEventType)
}
// matchService reports whether ev passes the "service" name/ID filter.
func (ef *Filter) matchService(ev events.Message) bool {
	return ef.fuzzyMatchName(ev, events.ServiceEventType)
}
// matchNode reports whether ev passes the "node" name/ID filter.
func (ef *Filter) matchNode(ev events.Message) bool {
	return ef.fuzzyMatchName(ev, events.NodeEventType)
}
// matchSecret reports whether ev passes the "secret" name/ID filter.
func (ef *Filter) matchSecret(ev events.Message) bool {
	return ef.fuzzyMatchName(ev, events.SecretEventType)
}
// matchConfig reports whether ev passes the "config" name/ID filter.
func (ef *Filter) matchConfig(ev events.Message) bool {
	return ef.fuzzyMatchName(ev, events.ConfigEventType)
}
// fuzzyMatchName matches the filter field named after eventType against both
// the actor's ID and its "name" attribute, so either identifies the object.
func (ef *Filter) fuzzyMatchName(ev events.Message, eventType events.Type) bool {
	return ef.filter.FuzzyMatch(string(eventType), ev.Actor.ID) || ef.filter.FuzzyMatch(string(eventType), ev.Actor.Attributes["name"])
}
// matchImage matches against both event.Actor.ID (for image events)
// and event.Actor.Attributes["image"] (for container events), so that any container that was created
// from an image will be included in the image events. Also compare both
// against the stripped repo name without any tags.
func (ef *Filter) matchImage(ev events.Message) bool {
	id := ev.Actor.ID
	nameAttr := "image"
	var imageName string
	// Image events carry the image name in the "name" attribute; container
	// events carry it in "image".
	if ev.Type == events.ImageEventType {
		nameAttr = "name"
	}
	if n, ok := ev.Actor.Attributes[nameAttr]; ok {
		imageName = n
	}
	return ef.filter.ExactMatch("image", id) ||
		ef.filter.ExactMatch("image", imageName) ||
		ef.filter.ExactMatch("image", stripTag(id)) ||
		ef.filter.ExactMatch("image", stripTag(imageName))
}
// stripTag returns the familiar repository name of image without any tag or
// digest. Inputs that do not parse as an image reference are returned as-is.
func stripTag(image string) string {
	ref, err := reference.ParseNormalizedNamed(image)
	if err != nil {
		return image
	}
	return reference.FamiliarName(ref)
}
package graphdriver
import (
"context"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"github.com/containerd/log"
"github.com/moby/go-archive"
"github.com/moby/sys/user"
"github.com/pkg/errors"
"github.com/vbatts/tar-split/tar/storage"
)
// drivers holds all registered graph-driver init functions, keyed by driver name.
var drivers = make(map[string]InitFunc)
// CreateOpts contains optional arguments for Create() and CreateReadWrite()
// methods.
type CreateOpts struct {
	MountLabel string            // SELinux mount label for the layer
	StorageOpt map[string]string // driver-specific storage options
}
// InitFunc initializes the storage driver rooted at the given directory,
// with driver-specific options and the daemon's identity mapping.
type InitFunc func(root string, options []string, idMap user.IdentityMapping) (Driver, error)
// ProtoDriver defines the basic capabilities of a driver.
// This interface exists solely to be a minimum set of methods
// for client code which choose not to implement the entire Driver
// interface and use the NaiveDiffDriver wrapper constructor.
//
// Use of ProtoDriver directly by client code is not recommended.
type ProtoDriver interface {
	// String returns a string representation of this driver.
	String() string
	// CreateReadWrite creates a new, empty filesystem layer that is ready
	// to be used as the storage for a container. Additional options can
	// be passed in opts. parent may be "" and opts may be nil.
	CreateReadWrite(id, parent string, opts *CreateOpts) error
	// Create creates a new, empty, filesystem layer with the
	// specified id and parent and options passed in opts. Parent
	// may be "" and opts may be nil.
	Create(id, parent string, opts *CreateOpts) error
	// Remove attempts to remove the filesystem layer with this id.
	Remove(id string) error
	// Get returns the mountpoint for the layered filesystem referred
	// to by this id. You can optionally specify a mountLabel or "".
	// Returns the absolute path to the mounted layered filesystem.
	// Callers in this package balance each successful Get with a Put.
	Get(id, mountLabel string) (fs string, err error)
	// Put releases the system resources for the specified id,
	// e.g, unmounting layered filesystem.
	Put(id string) error
	// Exists returns whether a filesystem layer with the specified
	// ID exists on this driver.
	Exists(id string) bool
	// Status returns a set of key-value pairs which give low
	// level diagnostic status about this driver.
	Status() [][2]string
	// GetMetadata returns a set of key-value pairs which give driver-specific
	// low-level information about the image/container that the driver is managing.
	GetMetadata(id string) (map[string]string, error)
	// Cleanup performs necessary tasks to release resources
	// held by the driver, e.g., unmounting all layered filesystems
	// known to this driver.
	Cleanup() error
}
// DiffDriver is the interface to use to implement graph diffs
type DiffDriver interface {
	// Diff produces an archive of the changes between the specified
	// layer and its parent layer which may be "".
	Diff(id, parent string) (io.ReadCloser, error)
	// Changes produces a list of changes between the specified layer
	// and its parent layer. If parent is "", then all changes will be ADD changes.
	Changes(id, parent string) ([]archive.Change, error)
	// ApplyDiff extracts the changeset from the given diff into the
	// layer with the specified id and parent, returning the size of the
	// new layer in bytes.
	// The archive.Reader must be an uncompressed stream.
	ApplyDiff(id, parent string, diff io.Reader) (size int64, err error)
	// DiffSize calculates the changes between the specified id
	// and its parent and returns the size in bytes of the changes
	// relative to its base filesystem directory.
	DiffSize(id, parent string) (size int64, err error)
}
// Driver is the interface for layered/snapshot file system drivers.
// It combines the minimal ProtoDriver capabilities with diffing support.
type Driver interface {
	ProtoDriver
	DiffDriver
}
// DiffGetterDriver is the interface for layered file system drivers that
// provide a specialized function for getting file contents for tar-split.
type DiffGetterDriver interface {
	Driver
	// DiffGetter returns an interface to efficiently retrieve the contents
	// of files in a layer.
	DiffGetter(id string) (FileGetCloser, error)
}
// FileGetCloser extends the storage.FileGetter interface with a Close method
// for cleaning up.
type FileGetCloser interface {
	storage.FileGetter
	// Close cleans up any resources associated with the FileGetCloser.
	Close() error
}
// Register registers an InitFunc for the driver. It returns an error when a
// driver with the same name was already registered.
func Register(name string, initFunc InitFunc) error {
	_, taken := drivers[name]
	if taken {
		return errors.Errorf("name already registered %s", name)
	}
	drivers[name] = initFunc
	return nil
}
// getDriver initializes and returns the registered driver. It returns
// ErrNotSupported when no driver with the given name was registered.
func getDriver(name string, config Options) (Driver, error) {
	initFunc, registered := drivers[name]
	if !registered {
		log.G(context.TODO()).WithFields(log.Fields{"driver": name, "home-dir": config.Root}).Error("Failed to GetDriver graph")
		return nil, ErrNotSupported
	}
	return initFunc(filepath.Join(config.Root, name), config.DriverOptions, config.IDMap)
}
// Options is used to initialize a graphdriver
type Options struct {
	// Root is the directory under which all driver state is stored.
	Root string
	// DriverOptions holds driver-specific option strings.
	DriverOptions []string
	// IDMap is the daemon's identity (user-namespace) mapping.
	IDMap user.IdentityMapping
	// ExperimentalEnabled indicates whether experimental features are enabled.
	ExperimentalEnabled bool
}
// New creates the driver and initializes it at the specified root.
//
// It is recommended to pass a name for the driver to use, but if no name
// is provided, it attempts to detect the prior storage driver based on
// existing state, or otherwise selects a storage driver based on a priority
// list and the underlying filesystem.
//
// It returns an error if the requested storage driver is not supported,
// if scanning prior drivers is ambiguous (i.e., if state is found for
// multiple drivers), or if no compatible driver is available for the
// platform and underlying filesystem.
func New(driverName string, config Options) (Driver, error) {
	ctx := context.TODO()
	if driverName != "" {
		log.G(ctx).Infof("[graphdriver] trying configured driver: %s", driverName)
		// Fail fast for drivers that have been deprecated and removed.
		if err := checkRemoved(driverName); err != nil {
			return nil, err
		}
		return getDriver(driverName, config)
	}
	// Guess for prior driver
	//
	// TODO(thaJeztah): move detecting prior drivers separate from New(), and make "name" a required argument.
	driversMap := scanPriorDrivers(config.Root)
	priorityList := strings.Split(priority, ",")
	log.G(ctx).Debugf("[graphdriver] priority list: %v", priorityList)
	for _, name := range priorityList {
		if _, prior := driversMap[name]; prior {
			// of the state found from prior drivers, check in order of our priority
			// which we would prefer
			driver, err := getDriver(name, config)
			if err != nil {
				// unlike below, we will return error here, because there is prior
				// state, and now it is no longer supported/prereq/compatible, so
				// something changed and needs attention. Otherwise the daemon's
				// images would just "disappear".
				log.G(ctx).Errorf("[graphdriver] prior storage driver %s failed: %s", name, err)
				return nil, err
			}
			// abort starting when there are other prior configured drivers
			// to ensure the user explicitly selects the driver to load
			if len(driversMap) > 1 {
				var driversSlice []string
				for d := range driversMap {
					driversSlice = append(driversSlice, d)
				}
				err = errors.Errorf("%s contains several valid graphdrivers: %s; cleanup or explicitly choose storage driver (-s <DRIVER>)", config.Root, strings.Join(driversSlice, ", "))
				log.G(ctx).Errorf("[graphdriver] %v", err)
				return nil, err
			}
			log.G(ctx).Infof("[graphdriver] using prior storage driver: %s", name)
			return driver, nil
		}
	}
	// If no prior state was found, continue with automatic selection, and pick
	// the first supported, non-deprecated, storage driver (in order of priorityList).
	for _, name := range priorityList {
		driver, err := getDriver(name, config)
		if err != nil {
			// "not supported" just means "try the next candidate".
			if IsDriverNotSupported(err) {
				continue
			}
			return nil, err
		}
		return driver, nil
	}
	// Check all registered drivers if no priority driver is found
	for name, initFunc := range drivers {
		driver, err := initFunc(filepath.Join(config.Root, name), config.DriverOptions, config.IDMap)
		if err != nil {
			if IsDriverNotSupported(err) {
				continue
			}
			return nil, err
		}
		return driver, nil
	}
	return nil, errors.Errorf("no supported storage driver found")
}
// scanPriorDrivers returns an un-ordered scan of directories of prior storage
// drivers. The 'vfs' storage driver is not taken into account, and ignored.
func scanPriorDrivers(root string) map[string]bool {
	found := make(map[string]bool)
	for name := range drivers {
		if name == "vfs" {
			continue
		}
		dir := filepath.Join(root, name)
		if _, err := os.Stat(dir); err != nil {
			continue
		}
		if !isEmptyDir(dir) {
			found[name] = true
		}
	}
	return found
}
// isEmptyDir checks if a directory is empty. It is used to check if prior
// storage-driver directories exist. If an error occurs, it also assumes the
// directory is not empty (which preserves the behavior _before_ this check
// was added)
func isEmptyDir(name string) bool {
f, err := os.Open(name)
if err != nil {
return false
}
defer f.Close()
if _, err = f.Readdirnames(1); errors.Is(err, io.EOF) {
return true
}
return false
}
// checkRemoved checks if a storage-driver has been deprecated (and removed).
// It returns a NotSupportedError for the removed aufs, devicemapper, and
// overlay (v1) drivers, and nil for everything else.
func checkRemoved(name string) error {
	switch name {
	case "aufs", "devicemapper", "overlay":
		return NotSupportedError(fmt.Sprintf("[graphdriver] ERROR: the %s storage-driver has been deprecated and removed; visit https://docs.docker.com/go/storage-driver/ for more information", name))
	}
	return nil
}
package graphdriver
// Sentinel errors used by drivers to signal that they cannot run in the
// current environment; all satisfy the ErrUnSupported interface.
const (
	// ErrNotSupported returned when driver is not supported.
	ErrNotSupported NotSupportedError = "driver not supported"
	// ErrPrerequisites returned when driver does not meet prerequisites.
	ErrPrerequisites NotSupportedError = "prerequisites for driver not satisfied (wrong filesystem?)"
	// ErrIncompatibleFS returned when file system is not supported.
	ErrIncompatibleFS NotSupportedError = "backing file system is unsupported for this graph driver"
)
// ErrUnSupported signals that the graph-driver is not supported on the
// current configuration. It is the marker interface checked by
// IsDriverNotSupported.
type ErrUnSupported interface {
	NotSupported()
}
// NotSupportedError signals that the graph-driver is not supported on the current configuration
type NotSupportedError string

// Error implements the error interface.
func (e NotSupportedError) Error() string {
	return string(e)
}
// NotSupported signals that a graph-driver is not supported. It makes
// NotSupportedError satisfy the ErrUnSupported marker interface.
func (e NotSupportedError) NotSupported() {}
// IsDriverNotSupported returns true if the error initializing
// the graph driver is a non-supported error. Note that it checks the
// error's concrete type directly and does not unwrap wrapped errors.
func IsDriverNotSupported(err error) bool {
	_, notSupported := err.(ErrUnSupported)
	return notSupported
}
package graphdriver
import (
"context"
"io"
"time"
"github.com/containerd/log"
"github.com/docker/docker/pkg/ioutils"
"github.com/moby/go-archive"
"github.com/moby/go-archive/chrootarchive"
"github.com/moby/sys/user"
)
// ApplyUncompressedLayer defines the unpack method used by the graph
// driver. It is a package-level variable so it can be swapped out
// (e.g. in tests); the default chroots into the target before unpacking.
var ApplyUncompressedLayer = chrootarchive.ApplyUncompressedLayer
// NaiveDiffDriver takes a ProtoDriver and adds the
// capability of the Diffing methods on the local file system,
// which it may or may not support on its own. See the comment
// on the exported NewNaiveDiffDriver function below.
type NaiveDiffDriver struct {
	ProtoDriver
	// IDMap is the identity mapping used when exporting changes.
	IDMap user.IdentityMapping
	// If true, allow ApplyDiff to succeed in spite of failures to set
	// extended attributes on the unpacked files due to the destination
	// filesystem not supporting them or a lack of permissions. The
	// resulting unpacked layer may be subtly broken.
	BestEffortXattrs bool
}
// NewNaiveDiffDriver returns a fully functional driver that wraps the
// given ProtoDriver and adds the capability of the following methods which
// it may or may not support on its own:
//
//	Diff(id, parent string) (archive.Archive, error)
//	Changes(id, parent string) ([]archive.Change, error)
//	ApplyDiff(id, parent string, diff archive.Reader) (size int64, err error)
//	DiffSize(id, parent string) (size int64, err error)
func NewNaiveDiffDriver(driver ProtoDriver, idMap user.IdentityMapping) Driver {
	return &NaiveDiffDriver{
		ProtoDriver: driver,
		IDMap:       idMap,
	}
}
// Diff produces an archive of the changes between the specified
// layer and its parent layer which may be "".
//
// The returned ReadCloser keeps the layer mounted until it is closed; the
// wrapper's Close releases the mount via driver.Put.
func (gdw *NaiveDiffDriver) Diff(id, parent string) (arch io.ReadCloser, retErr error) {
	startTime := time.Now()
	driver := gdw.ProtoDriver
	layerRootFs, err := driver.Get(id, "")
	if err != nil {
		return nil, err
	}
	layerFs := layerRootFs
	// Release the mount on any error path; on success the returned
	// ReadCloser owns the reference instead.
	defer func() {
		if retErr != nil {
			_ = driver.Put(id)
		}
	}()
	if parent == "" {
		// No parent: the whole layer is the diff.
		tarArchive, err := archive.Tar(layerFs, archive.Uncompressed)
		if err != nil {
			return nil, err
		}
		return ioutils.NewReadCloserWrapper(tarArchive, func() error {
			err := tarArchive.Close()
			// NOTE(review): the Put error is discarded here (best-effort
			// unmount); presumably deliberate — confirm before changing.
			driver.Put(id)
			return err
		}), nil
	}
	parentFs, err := driver.Get(parent, "")
	if err != nil {
		return nil, err
	}
	defer driver.Put(parent)
	changes, err := archive.ChangesDirs(layerFs, parentFs)
	if err != nil {
		return nil, err
	}
	tarArchive, err := archive.ExportChanges(layerFs, changes, gdw.IDMap)
	if err != nil {
		return nil, err
	}
	return ioutils.NewReadCloserWrapper(tarArchive, func() error {
		err := tarArchive.Close()
		driver.Put(id)
		// NaiveDiffDriver compares file metadata with parent layers. Parent layers
		// are extracted from tar's with full second precision on modified time.
		// We need this hack here to make sure calls within same second receive
		// correct result.
		time.Sleep(time.Until(startTime.Truncate(time.Second).Add(time.Second)))
		return err
	}), nil
}
// Changes produces a list of changes between the specified layer
// and its parent layer. If parent is "", then all changes will be ADD changes.
func (gdw *NaiveDiffDriver) Changes(id, parent string) ([]archive.Change, error) {
	driver := gdw.ProtoDriver
	layerFs, err := driver.Get(id, "")
	if err != nil {
		return nil, err
	}
	defer driver.Put(id)
	var parentFs string
	if parent != "" {
		if parentFs, err = driver.Get(parent, ""); err != nil {
			return nil, err
		}
		defer driver.Put(parent)
	}
	return archive.ChangesDirs(layerFs, parentFs)
}
// ApplyDiff extracts the changeset from the given diff into the
// layer with the specified id and parent, returning the size of the
// new layer in bytes.
func (gdw *NaiveDiffDriver) ApplyDiff(id, parent string, diff io.Reader) (size int64, _ error) {
	driver := gdw.ProtoDriver
	// Mount the root filesystem so we can apply the diff/layer.
	layerFs, err := driver.Get(id, "")
	if err != nil {
		return 0, err
	}
	defer driver.Put(id)
	options := &archive.TarOptions{IDMap: gdw.IDMap, BestEffortXattrs: gdw.BestEffortXattrs}
	start := time.Now()
	log.G(context.TODO()).WithField("id", id).Debug("Start untar layer")
	appliedSize, err := ApplyUncompressedLayer(layerFs, diff, options)
	if err != nil {
		return 0, err
	}
	// time.Since uses the monotonic clock and makes the UTC conversions of
	// the previous `time.Now().UTC().Sub(start)` form unnecessary: a
	// duration has no time zone.
	log.G(context.TODO()).WithField("id", id).Debugf("Untar time: %vs", time.Since(start).Seconds())
	return appliedSize, nil
}
// DiffSize calculates the changes between the specified layer
// and its parent and returns the size in bytes of the changes
// relative to its base filesystem directory.
func (gdw *NaiveDiffDriver) DiffSize(id, parent string) (size int64, _ error) {
	changes, err := gdw.Changes(id, parent)
	if err != nil {
		return 0, err
	}
	driver := gdw.ProtoDriver
	layerFs, err := driver.Get(id, "")
	if err != nil {
		return 0, err
	}
	defer driver.Put(id)
	return archive.ChangesSize(layerFs, changes), nil
}
package graphdriver
import (
"fmt"
"strings"
)
// ParseStorageOptKeyValue parses and validates the specified string as a key/value
// pair (key=value). Both key and value are trimmed of surrounding whitespace,
// and the value may itself contain '=' characters.
func ParseStorageOptKeyValue(opt string) (key string, value string, err error) {
	before, after, found := strings.Cut(opt, "=")
	if !found {
		return "", "", fmt.Errorf("unable to parse storage-opt key/value: %s", opt)
	}
	return strings.TrimSpace(before), strings.TrimSpace(after), nil
}
package images
import (
"context"
"encoding/json"
"fmt"
"github.com/containerd/log"
"github.com/docker/docker/daemon/builder"
"github.com/docker/docker/image"
"github.com/docker/docker/image/cache"
"github.com/docker/docker/layer"
"github.com/moby/moby/api/types/backend"
)
// cacheAdaptor adapts an ImageService to the interface expected by the
// build-cache package, delegating to the underlying image store.
type cacheAdaptor struct {
	is *ImageService
}
// Get returns the image with the given ID from the image store.
func (c cacheAdaptor) Get(id image.ID) (*image.Image, error) {
	return c.is.imageStore.Get(id)
}
// GetByRef resolves a reference or ID to an image via the image service.
func (c cacheAdaptor) GetByRef(ctx context.Context, refOrId string) (*image.Image, error) {
	return c.is.GetImage(ctx, refOrId, backend.GetImageOpts{})
}
// SetParent records parent as the parent image of target.
func (c cacheAdaptor) SetParent(target, parent image.ID) error {
	return c.is.imageStore.SetParent(target, parent)
}
// GetParent returns the recorded parent image ID of target.
func (c cacheAdaptor) GetParent(target image.ID) (image.ID, error) {
	return c.is.imageStore.GetParent(target)
}
// IsBuiltLocally reports whether target was built on this daemon.
func (c cacheAdaptor) IsBuiltLocally(target image.ID) (bool, error) {
	return c.is.imageStore.IsBuiltLocally(target)
}
// Children returns the IDs of images whose parent is imgID. For the empty
// ID ("FROM scratch"), it instead returns locally-built images that have no
// parent, which serve as cache candidates for scratch-based builds.
func (c cacheAdaptor) Children(imgID image.ID) []image.ID {
	if imgID != "" {
		// Not FROM scratch
		return c.is.imageStore.Children(imgID)
	}
	var siblings []image.ID
	for id, img := range c.is.imageStore.Map() {
		if img.Parent != "" {
			continue
		}
		builtLocally, err := c.is.imageStore.IsBuiltLocally(id)
		if err != nil {
			log.G(context.TODO()).WithFields(log.Fields{
				"error": err,
				"id":    id,
			}).Warn("failed to check if image was built locally")
			continue
		}
		if builtLocally {
			siblings = append(siblings, id)
		}
	}
	return siblings
}
// Create stores a new image config in the image store and, when parent is
// non-nil, records the parent relationship. It returns the new image ID.
func (c cacheAdaptor) Create(parent *image.Image, image image.Image, _ layer.DiffID) (image.ID, error) {
	data, err := json.Marshal(image)
	if err != nil {
		return "", fmt.Errorf("failed to marshal image config: %w", err)
	}
	imgID, err := c.is.imageStore.Create(data)
	if err != nil {
		return "", err
	}
	if parent == nil {
		return imgID, nil
	}
	if err := c.is.imageStore.SetParent(imgID, parent.ID()); err != nil {
		return "", fmt.Errorf("failed to set parent for %v to %v: %w", imgID, parent.ID(), err)
	}
	return imgID, nil
}
// MakeImageCache creates a stateful image cache backed by this image
// service, seeded from the given source references.
func (i *ImageService) MakeImageCache(ctx context.Context, sourceRefs []string) (builder.ImageCache, error) {
	return cache.New(ctx, cacheAdaptor{i}, sourceRefs)
}
package images
import (
"context"
"encoding/json"
"fmt"
"io"
"github.com/containerd/containerd/v2/core/content"
c8dimages "github.com/containerd/containerd/v2/core/images"
"github.com/containerd/containerd/v2/core/leases"
cerrdefs "github.com/containerd/errdefs"
"github.com/containerd/log"
"github.com/containerd/platforms"
"github.com/distribution/reference"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/image"
"github.com/moby/moby/api/types/backend"
"github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// ErrImageDoesNotExist is error returned when no image can be found for a reference.
type ErrImageDoesNotExist struct {
	// Ref is the reference that failed to resolve to an image.
	Ref reference.Reference
}
// Error implements the error interface. Named references are normalized
// with a default tag before formatting for a user-friendly message.
func (e ErrImageDoesNotExist) Error() string {
	ref := e.Ref
	if named, ok := ref.(reference.Named); ok {
		ref = reference.TagNameOnly(named)
	}
	return fmt.Sprintf("No such image: %s", reference.FamiliarString(ref))
}
// NotFound implements the NotFound interface
func (e ErrImageDoesNotExist) NotFound() {}
// manifestList is a minimal view of an OCI image index / manifest list,
// decoding only the manifest descriptors.
type manifestList struct {
	Manifests []ocispec.Descriptor `json:"manifests"`
}
// manifest is a minimal view of an image manifest, decoding only the
// config descriptor.
type manifest struct {
	Config ocispec.Descriptor `json:"config"`
}
// manifestMatchesPlatform reports whether any manifest referenced by the
// image's lease resources matches the desired platform AND points at the
// image's own config digest. It is used as a fallback when the platform in
// the image config itself looks wrong.
//
// Fix: the log message "Error desserializing manifest" was misspelled; it
// now reads "deserializing".
func (i *ImageService) manifestMatchesPlatform(ctx context.Context, img *image.Image, platform ocispec.Platform) (bool, error) {
	ls, err := i.leases.ListResources(ctx, leases.Lease{ID: imageKey(img.ID().String())})
	if err != nil {
		if cerrdefs.IsNotFound(err) {
			// No lease: nothing to inspect, but not an error.
			return false, nil
		}
		log.G(ctx).WithFields(log.Fields{
			"error":           err,
			"image":           img.ID,
			"desiredPlatform": platforms.FormatAll(platform),
		}).Error("Error looking up image leases")
		return false, err
	}
	// Note we are comparing against manifest lists here, which we expect to always have a CPU variant set (where applicable).
	// So there is no need for the fallback matcher here.
	comparer := platforms.Only(platform)
	var (
		ml manifestList
		m  manifest
	)
	// Cap reads at 1MB to avoid loading oversized blobs into memory.
	makeRdr := func(ra content.ReaderAt) io.Reader {
		return io.LimitReader(io.NewSectionReader(ra, 0, ra.Size()), 1e6)
	}
	for _, r := range ls {
		logger := log.G(ctx).WithFields(log.Fields{
			"image":           img.ID,
			"desiredPlatform": platforms.FormatAll(platform),
			"resourceID":      r.ID,
			"resourceType":    r.Type,
		})
		logger.Debug("Checking lease resource for platform match")
		if r.Type != "content" {
			continue
		}
		ra, err := i.content.ReaderAt(ctx, ocispec.Descriptor{Digest: digest.Digest(r.ID)})
		if err != nil {
			if cerrdefs.IsNotFound(err) {
				continue
			}
			logger.WithError(err).Error("Error looking up referenced manifest list for image")
			continue
		}
		data, err := io.ReadAll(makeRdr(ra))
		ra.Close()
		if err != nil {
			logger.WithError(err).Error("Error reading manifest list for image")
			continue
		}
		// Reset before reuse so stale entries from a previous iteration
		// cannot leak through.
		ml.Manifests = nil
		if err := json.Unmarshal(data, &ml); err != nil {
			logger.WithError(err).Error("Error unmarshalling content")
			continue
		}
		for _, md := range ml.Manifests {
			switch md.MediaType {
			case ocispec.MediaTypeImageManifest, c8dimages.MediaTypeDockerSchema2Manifest:
			default:
				continue
			}
			p := ocispec.Platform{
				Architecture: md.Platform.Architecture,
				OS:           md.Platform.OS,
				Variant:      md.Platform.Variant,
			}
			if !comparer.Match(p) {
				logger.WithField("otherPlatform", platforms.FormatAll(p)).Debug("Manifest is not a match")
				continue
			}
			// Here we have a platform match for the referenced manifest, let's make sure the manifest is actually for the image config we are using.
			ra, err := i.content.ReaderAt(ctx, ocispec.Descriptor{Digest: md.Digest})
			if err != nil {
				logger.WithField("otherDigest", md.Digest).WithError(err).Error("Could not get reader for manifest")
				continue
			}
			data, err := io.ReadAll(makeRdr(ra))
			ra.Close()
			if err != nil {
				logger.WithError(err).Error("Error reading manifest for image")
				continue
			}
			if err := json.Unmarshal(data, &m); err != nil {
				logger.WithError(err).Error("Error deserializing manifest")
				continue
			}
			if m.Config.Digest == img.ID().Digest() {
				logger.WithField("manifestDigest", md.Digest).Debug("Found matching manifest for image")
				return true, nil
			}
			logger.WithField("otherDigest", md.Digest).Debug("Skipping non-matching manifest")
		}
	}
	return false, nil
}
// GetImage returns an image corresponding to the image referred to by refOrID.
//
// When options.Platform is set, a deferred post-check validates that the
// resolved image's platform matches; a mismatch is reported as a NotFound
// error (after consulting the image's manifest list as a fallback).
func (i *ImageService) GetImage(ctx context.Context, refOrID string, options backend.GetImageOpts) (retImg *image.Image, retErr error) {
	defer func() {
		// Platform validation runs only on successful lookups with an
		// explicitly requested platform.
		if retErr != nil || retImg == nil || options.Platform == nil {
			return
		}
		imgPlat := ocispec.Platform{
			OS:           retImg.OS,
			Architecture: retImg.Architecture,
			Variant:      retImg.Variant,
		}
		p := *options.Platform
		// Note that `platforms.Only` will fuzzy match this for us
		// For example: an armv6 image will run just fine on an armv7 CPU, without emulation or anything.
		if OnlyPlatformWithFallback(p).Match(imgPlat) {
			return
		}
		// In some cases the image config can actually be wrong (e.g. classic `docker build` may not handle `--platform` correctly)
		// So we'll look up the manifest list that corresponds to this image to check if at least the manifest list says it is the correct image.
		var matches bool
		matches, retErr = i.manifestMatchesPlatform(ctx, retImg, p)
		if matches || retErr != nil {
			return
		}
		// This allows us to tell clients that we don't have the image they asked for
		// Where this gets hairy is the image store does not currently support multi-arch images, e.g.:
		// An image `foo` may have a multi-arch manifest, but the image store only fetches the image for a specific platform
		// The image store does not store the manifest list and image tags are assigned to architecture specific images.
		// So we can have a `foo` image that is amd64 but the user requested armv7. If the user looks at the list of images.
		// This may be confusing.
		// The alternative to this is to return an errdefs.Conflict error with a helpful message, but clients will not be
		// able to automatically tell what causes the conflict.
		imgName := refOrID
		if ref, err := reference.ParseNamed(refOrID); err == nil {
			imgName = reference.FamiliarString(ref)
		}
		retErr = errdefs.NotFound(errors.Errorf("image with reference %s was found but its platform (%s) does not match the specified platform (%s)", imgName, platforms.FormatAll(imgPlat), platforms.FormatAll(p)))
	}()
	ref, err := reference.ParseAnyReference(refOrID)
	if err != nil {
		return nil, errdefs.InvalidParameter(err)
	}
	namedRef, ok := ref.(reference.Named)
	if !ok {
		// Not a named reference: only a plain digest can still resolve.
		digested, ok := ref.(reference.Digested)
		if !ok {
			return nil, ErrImageDoesNotExist{Ref: ref}
		}
		if img, err := i.imageStore.Get(image.ID(digested.Digest())); err == nil {
			return img, nil
		}
		return nil, ErrImageDoesNotExist{Ref: ref}
	}
	if dgst, err := i.referenceStore.Get(namedRef); err == nil {
		// Search the image stores to get the operating system, defaulting to host OS.
		if img, err := i.imageStore.Get(image.ID(dgst)); err == nil {
			return img, nil
		}
	}
	// Search based on ID
	if id, err := i.imageStore.Search(refOrID); err == nil {
		img, err := i.imageStore.Get(id)
		if err != nil {
			return nil, ErrImageDoesNotExist{Ref: ref}
		}
		return img, nil
	}
	return nil, ErrImageDoesNotExist{Ref: ref}
}
// OnlyPlatformWithFallback uses `platforms.Only` with a fallback to handle the case where the platform
// being matched does not have a CPU variant.
//
// The reason for this is that CPU variant is not even in the official image config spec as of this writing.
// See: https://github.com/opencontainers/image-spec/pull/809
// Since Docker tends to compare platforms from the image config, we need to handle this case.
func OnlyPlatformWithFallback(p ocispec.Platform) platforms.Matcher {
	return &onlyFallbackMatcher{only: platforms.Only(p), p: platforms.Normalize(p)}
}
// onlyFallbackMatcher wraps a strict platforms.Only matcher with a
// variant-less OS/architecture fallback. See OnlyPlatformWithFallback.
type onlyFallbackMatcher struct {
	// only is the strict matcher tried first.
	only platforms.Matcher
	// p is the normalized platform used for the fallback comparison.
	p ocispec.Platform
}
// Match implements platforms.Matcher. It first delegates to the strict
// matcher; failing that, it compares OS and architecture alone, but only
// when the candidate platform carries no variant.
func (m *onlyFallbackMatcher) Match(other ocispec.Platform) bool {
	switch {
	case m.only.Match(other):
		// Strict match succeeded; no fallback needed.
		return true
	case other.Variant != "":
		// A variant is present, so the variant-less fallback does not apply.
		return false
	}
	// note that platforms.Normalize adds a default variant... which is the
	// whole problem with [platforms.Only], so we can't match on that.
	norm := platforms.Normalize(other)
	return m.p.OS == norm.OS && m.p.Architecture == norm.Architecture
}
package images
import (
"context"
"io"
"runtime"
cerrdefs "github.com/containerd/errdefs"
"github.com/containerd/log"
"github.com/containerd/platforms"
"github.com/distribution/reference"
"github.com/docker/docker/daemon/builder"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
"github.com/docker/docker/pkg/progress"
"github.com/docker/docker/pkg/streamformatter"
"github.com/docker/docker/pkg/stringid"
"github.com/moby/moby/api/types/backend"
"github.com/moby/moby/api/types/registry"
"github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// roLayer wraps a read-only layer (possibly nil, for FROM scratch) together
// with its store, implementing builder.ROLayer.
type roLayer struct {
	// released guards against double-release.
	released   bool
	layerStore layer.Store
	// roLayer is nil for the empty (scratch) layer.
	roLayer layer.Layer
}
// ContentStoreDigest always returns the empty digest: this implementation
// is not backed by a content store.
func (l *roLayer) ContentStoreDigest() digest.Digest {
	return ""
}
// DiffID returns the diff ID of the wrapped layer, or the well-known
// empty-tar digest when there is no underlying layer (FROM scratch).
func (l *roLayer) DiffID() layer.DiffID {
	if l.roLayer == nil {
		return layer.DigestSHA256EmptyTar
	}
	return l.roLayer.DiffID()
}
// Release drops the reference held on the underlying read-only layer.
// It is idempotent: calls after a successful release are no-ops.
func (l *roLayer) Release() error {
	if l.released {
		return nil
	}
	if l.roLayer == nil {
		// Nothing to release for the empty (scratch) layer.
		l.released = true
		return nil
	}
	metadata, err := l.layerStore.Release(l.roLayer)
	for _, m := range metadata {
		log.G(context.TODO()).WithField("chainID", m.ChainID).Infof("release ROLayer: cleaned up layer %s", m.ChainID)
	}
	if err != nil {
		return errors.Wrap(err, "failed to release ROLayer")
	}
	l.roLayer = nil
	l.released = true
	return nil
}
// NewRWLayer creates a new writable layer on top of this read-only layer
// (or on an empty base when there is no underlying layer), mounts it, and
// returns it wrapped as a builder.RWLayer.
func (l *roLayer) NewRWLayer() (builder.RWLayer, error) {
	var chainID layer.ChainID
	if l.roLayer != nil {
		chainID = l.roLayer.ChainID()
	}
	mountID := stringid.GenerateRandomID()
	newLayer, err := l.layerStore.CreateRWLayer(mountID, chainID, nil)
	if err != nil {
		return nil, errors.Wrap(err, "failed to create rwlayer")
	}
	rwLayer := &rwLayer{layerStore: l.layerStore, rwLayer: newLayer}
	fs, err := newLayer.Mount("")
	if err != nil {
		// Best-effort cleanup; the Release error is discarded in favor of
		// the original Mount error.
		rwLayer.Release()
		return nil, err
	}
	rwLayer.fs = fs
	return rwLayer, nil
}
// rwLayer wraps a mounted writable layer, implementing builder.RWLayer.
type rwLayer struct {
	// released guards against double-release.
	released   bool
	layerStore layer.Store
	rwLayer    layer.RWLayer
	// fs is the mount point; "" when not mounted.
	fs string
}
// Root returns the mount point of the writable layer ("" when unmounted).
func (l *rwLayer) Root() string {
	return l.fs
}
// Commit registers the current contents of the writable layer as a new
// read-only layer and returns it wrapped as a builder.ROLayer.
func (l *rwLayer) Commit() (builder.ROLayer, error) {
	stream, err := l.rwLayer.TarStream()
	if err != nil {
		return nil, err
	}
	defer stream.Close()
	var parentChainID layer.ChainID
	if p := l.rwLayer.Parent(); p != nil {
		parentChainID = p.ChainID()
	}
	committed, err := l.layerStore.Register(stream, parentChainID)
	if err != nil {
		return nil, err
	}
	// TODO: An optimization would be to handle empty layers before returning
	return &roLayer{layerStore: l.layerStore, roLayer: committed}, nil
}
// Release unmounts (if mounted) and releases the writable layer.
// It is idempotent once it has succeeded; the ordering (unmount, then
// release, then mark released) is significant and must be preserved.
func (l *rwLayer) Release() error {
	if l.released {
		return nil
	}
	if l.fs != "" {
		if err := l.rwLayer.Unmount(); err != nil {
			return errors.Wrap(err, "failed to unmount RWLayer")
		}
		l.fs = ""
	}
	metadata, err := l.layerStore.ReleaseRWLayer(l.rwLayer)
	for _, m := range metadata {
		log.G(context.TODO()).WithField("chainID", m.ChainID).Infof("release RWLayer: cleaned up layer %s", m.ChainID)
	}
	if err != nil {
		return errors.Wrap(err, "failed to release RWLayer")
	}
	l.released = true
	return nil
}
// newROLayerForImage wraps the topmost layer of img in a roLayer, taking a
// reference on it so it cannot be removed before it is released. A nil
// image or an image without layers yields an empty roLayer (FROM scratch).
func newROLayerForImage(img *image.Image, layerStore layer.Store) (builder.ROLayer, error) {
	if img == nil || img.RootFS.ChainID() == "" {
		return &roLayer{layerStore: layerStore}, nil
	}
	// Hold a reference to the image layer so that it can't be removed before
	// it is released
	topLayer, err := layerStore.Get(img.RootFS.ChainID())
	if err != nil {
		return nil, errors.Wrapf(err, "failed to get layer for image %s", img.ImageID())
	}
	return &roLayer{layerStore: layerStore, roLayer: topLayer}, nil
}
// TODO: could this use the regular daemon PullImage ?
func (i *ImageService) pullForBuilder(ctx context.Context, name string, authConfigs map[string]registry.AuthConfig, output io.Writer, platform *ocispec.Platform) (*image.Image, error) {
ref, err := reference.ParseNormalizedNamed(name)
if err != nil {
return nil, err
}
ref = reference.TagNameOnly(ref)
pullRegistryAuth := ®istry.AuthConfig{}
if len(authConfigs) > 0 {
// The request came with a full auth config, use it
resolvedConfig := i.registryService.ResolveAuthConfig(authConfigs, ref)
pullRegistryAuth = &resolvedConfig
}
if err := i.pullImageWithReference(ctx, ref, platform, nil, pullRegistryAuth, output); err != nil {
return nil, err
}
img, err := i.GetImage(ctx, name, backend.GetImageOpts{Platform: platform})
if cerrdefs.IsNotFound(err) && img != nil && platform != nil {
imgPlat := ocispec.Platform{
OS: img.OS,
Architecture: img.BaseImgArch(),
Variant: img.BaseImgVariant(),
}
p := *platform
if !platforms.Only(p).Match(imgPlat) {
po := streamformatter.NewJSONProgressOutput(output, false)
progress.Messagef(po, "", `
WARNING: Pulled image with specified platform (%s), but the resulting image's configured platform (%s) does not match.
This is most likely caused by a bug in the build system that created the fetched image (%s).
Please notify the image author to correct the configuration.`,
platforms.FormatAll(p), platforms.FormatAll(imgPlat), name,
)
log.G(ctx).WithError(err).WithField("image", name).Warn("Ignoring error about platform mismatch where the manifest list points to an image whose configuration does not match the platform in the manifest.")
err = nil
}
}
return img, err
}
// GetImageAndReleasableLayer returns an image and releaseable layer for a reference or ID.
// Every call to GetImageAndReleasableLayer MUST call releasableLayer.Release() to prevent
// leaking of layers.
func (i *ImageService) GetImageAndReleasableLayer(ctx context.Context, refOrID string, opts backend.GetImageAndLayerOptions) (builder.Image, builder.ROLayer, error) {
	if refOrID == "" { // FROM scratch
		if runtime.GOOS == "windows" {
			return nil, nil, errors.New(`"FROM scratch" is not supported on Windows`)
		}
		if opts.Platform != nil {
			if err := image.CheckOS(opts.Platform.OS); err != nil {
				return nil, nil, err
			}
		}
		// A nil image with an empty roLayer represents scratch.
		lyr, err := newROLayerForImage(nil, i.layerStore)
		return nil, lyr, err
	}
	if opts.PullOption != backend.PullOptionForcePull {
		// Try the local store first unless a pull was forced.
		img, err := i.GetImage(ctx, refOrID, backend.GetImageOpts{Platform: opts.Platform})
		if err != nil && opts.PullOption == backend.PullOptionNoPull {
			return nil, nil, err
		}
		// Only a not-found error may fall through to pulling below.
		if err != nil && !cerrdefs.IsNotFound(err) {
			return nil, nil, err
		}
		if img != nil {
			if err := image.CheckOS(img.OperatingSystem()); err != nil {
				return nil, nil, err
			}
			lyr, err := newROLayerForImage(img, i.layerStore)
			return img, lyr, err
		}
	}
	// Not available locally (or pull forced): pull it for the builder.
	img, err := i.pullForBuilder(ctx, refOrID, opts.AuthConfig, opts.Output, opts.Platform)
	if err != nil {
		return nil, nil, err
	}
	if err := image.CheckOS(img.OperatingSystem()); err != nil {
		return nil, nil, err
	}
	lyr, err := newROLayerForImage(img, i.layerStore)
	return img, lyr, err
}
// CreateImage creates a new image by adding a config and ID to the image store.
// This is similar to LoadImage() except that it receives JSON encoded bytes of
// an image instead of a tar archive.
func (i *ImageService) CreateImage(ctx context.Context, config []byte, parent string, _ digest.Digest) (builder.Image, error) {
	imgID, err := i.imageStore.Create(config)
	if err != nil {
		return nil, errors.Wrapf(err, "failed to create image")
	}

	// Record the parent relationship, if any.
	if parent != "" {
		if err := i.imageStore.SetParent(imgID, image.ID(parent)); err != nil {
			return nil, errors.Wrapf(err, "failed to set parent %s", parent)
		}
	}

	// Mark the image as built on this daemon.
	if err := i.imageStore.SetBuiltLocally(imgID); err != nil {
		return nil, errors.Wrapf(err, "failed to mark image %s as built locally", imgID)
	}

	return i.imageStore.Get(imgID)
}
package images
import (
"context"
"errors"
"fmt"
"github.com/docker/docker/daemon/container"
"github.com/docker/docker/layer"
"github.com/moby/go-archive"
)
// Changes returns the filesystem changes recorded in the container's
// read-write layer relative to its base image.
func (i *ImageService) Changes(ctx context.Context, container *container.Container) ([]archive.Change, error) {
	container.Lock()
	defer container.Unlock()

	// A missing RWLayer means there is nothing to diff against.
	if container.RWLayer == nil {
		return nil, errors.New("RWLayer of container " + container.Name + " is unexpectedly nil")
	}
	rw, isRWLayer := container.RWLayer.(layer.RWLayer)
	if !isRWLayer {
		return nil, fmt.Errorf("container %s has an unexpected RWLayer type: %T", container.Name, container.RWLayer)
	}
	return rw.Changes()
}
package images
import (
"context"
"encoding/json"
"io"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
"github.com/docker/docker/pkg/ioutils"
"github.com/moby/moby/api/types/backend"
"github.com/moby/moby/api/types/events"
"github.com/pkg/errors"
)
// CommitImage creates a new image from a commit config
func (i *ImageService) CommitImage(ctx context.Context, c backend.CommitConfig) (image.ID, error) {
	// Bail out early if the caller's context is already cancelled.
	if err := ctx.Err(); err != nil {
		return "", err
	}
	// Export the container's read-write layer as a tar stream; this diff
	// becomes the top layer of the committed image.
	rwTar, err := exportContainerRw(i.layerStore, c.ContainerID, c.ContainerMountLabel)
	if err != nil {
		return "", err
	}
	defer func() {
		if rwTar != nil {
			rwTar.Close()
		}
	}()
	// Resolve the parent image; an empty parent ID means the container was
	// started from scratch, so an empty root FS is used as the base.
	var parent *image.Image
	if c.ParentImageID == "" {
		parent = new(image.Image)
		parent.RootFS = image.NewRootFS()
	} else {
		parent, err = i.imageStore.Get(image.ID(c.ParentImageID))
		if err != nil {
			return "", err
		}
	}
	// Register the RW diff on top of the parent's layer chain.
	l, err := i.layerStore.Register(rwTar, parent.RootFS.ChainID())
	if err != nil {
		return "", err
	}
	// The image store takes its own reference to the layer; release ours.
	defer layer.ReleaseAndLog(i.layerStore, l)
	cc := image.ChildConfig{
		ContainerID: c.ContainerID,
		Author: c.Author,
		Comment: c.Comment,
		ContainerConfig: c.ContainerConfig,
		Config: c.Config,
		DiffID: l.DiffID(),
	}
	config, err := json.Marshal(image.NewChildImage(parent, cc, c.ContainerOS))
	if err != nil {
		return "", err
	}
	id, err := i.imageStore.Create(config)
	if err != nil {
		return "", err
	}
	i.LogImageEvent(ctx, id.String(), id.String(), events.ActionCreate)
	if err := i.imageStore.SetBuiltLocally(id); err != nil {
		return "", err
	}
	// Record the parent relationship last, after the image exists in the store.
	if c.ParentImageID != "" {
		if err := i.imageStore.SetParent(id, image.ID(c.ParentImageID)); err != nil {
			return "", err
		}
	}
	return id, nil
}
// exportContainerRw returns a tar stream of the container's read-write layer.
// The returned ReadCloser owns the layer: closing it closes the stream,
// unmounts the layer, and releases the reference acquired here.
func exportContainerRw(layerStore layer.Store, id, mountLabel string) (arch io.ReadCloser, retErr error) {
	rwlayer, err := layerStore.GetRWLayer(id)
	if err != nil {
		return nil, err
	}
	// On any error below, give back the RW layer reference taken above.
	defer func() {
		if retErr != nil {
			_, _ = layerStore.ReleaseRWLayer(rwlayer)
		}
	}()
	// TODO: this mount call is not necessary as we assume that TarStream() should
	// mount the layer if needed. But the Diff() function for windows requests that
	// the layer should be mounted when calling it. So we reserve this mount call
	// until windows driver can implement Diff() interface correctly.
	if _, err := rwlayer.Mount(mountLabel); err != nil {
		return nil, err
	}
	archive, err := rwlayer.TarStream()
	if err != nil {
		_ = rwlayer.Unmount()
		return nil, err
	}
	// Wrap the stream so Close() tears everything down in order:
	// stream, mount, then the layer reference.
	return ioutils.NewReadCloserWrapper(archive, func() error {
		_ = archive.Close()
		err := rwlayer.Unmount()
		_, _ = layerStore.ReleaseRWLayer(rwlayer)
		return err
	}), nil
}
// CommitBuildStep is used by the builder to create an image for each step in
// the build.
//
// This method is different from CreateImageFromContainer:
// - it doesn't attempt to validate container state
// - it doesn't send a commit action to metrics
// - it doesn't log a container commit event
//
// This is a temporary shim. Should be removed when builder stops using commit.
func (i *ImageService) CommitBuildStep(ctx context.Context, c backend.CommitConfig) (image.ID, error) {
	target := i.containers.Get(c.ContainerID)
	if target == nil {
		// TODO: use typed error
		return "", errors.Errorf("container not found: %s", c.ContainerID)
	}
	// Fill in commit details from the container before delegating to
	// CommitImage, which does the actual work.
	c.ContainerMountLabel = target.MountLabel
	c.ContainerOS = target.ImagePlatform.OS
	c.ParentImageID = string(target.ImageID)
	return i.CommitImage(ctx, c)
}
package images
import (
"context"
"fmt"
"strings"
"time"
"github.com/distribution/reference"
"github.com/docker/docker/daemon/container"
"github.com/docker/docker/daemon/internal/metrics"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/image"
"github.com/docker/docker/pkg/stringid"
"github.com/moby/moby/api/types/backend"
"github.com/moby/moby/api/types/events"
imagetypes "github.com/moby/moby/api/types/image"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// conflictType is a bitmask describing which conditions block deletion of an
// image.
type conflictType int

const (
	conflictDependentChild conflictType = 1 << iota
	conflictRunningContainer
	conflictActiveReference
	conflictStoppedContainer

	// conflictHard conditions can never be overridden, even by a forced delete.
	conflictHard = conflictDependentChild | conflictRunningContainer
	// conflictSoft conditions may be overridden when force is set.
	conflictSoft = conflictActiveReference | conflictStoppedContainer
)
// ImageDelete deletes the image referenced by the given imageRef from this
// daemon. The given imageRef can be an image ID, ID prefix, or a repository
// reference (with an optional tag or digest, defaulting to the tag name
// "latest"). There is differing behavior depending on whether the given
// imageRef is a repository reference or not.
//
// If the given imageRef is a repository reference then that repository
// reference will be removed. However, if there exists any containers which
// were created using the same image reference then the repository reference
// cannot be removed unless either there are other repository references to the
// same image or options.Force is true. Following removal of the repository reference,
// the referenced image itself will attempt to be deleted as described below
// but quietly, meaning any image delete conflicts will cause the image to not
// be deleted and the conflict will not be reported.
//
// There may be conflicts preventing deletion of an image and these conflicts
// are divided into two categories grouped by their severity:
//
// Hard Conflict:
// - a pull or build using the image.
// - any descendant image.
// - any running container using the image.
//
// Soft Conflict:
// - any stopped container using the image.
// - any repository tag or digest references to the image.
//
// The image cannot be removed if there are any hard conflicts and can be
// removed if there are soft conflicts only if options.Force is true.
//
// If options.PruneChildren is true, ancestor images are attempted to be deleted quietly,
// meaning any delete conflicts will cause the image to not be deleted and the
// conflict will not be reported.
func (i *ImageService) ImageDelete(ctx context.Context, imageRef string, options imagetypes.RemoveOptions) ([]imagetypes.DeleteResponse, error) {
	start := time.Now()
	records := []imagetypes.DeleteResponse{}
	// At most one explicit platform is supported by this image service.
	var platform *ocispec.Platform
	switch len(options.Platforms) {
	case 0:
	case 1:
		platform = &options.Platforms[0]
	default:
		return nil, errdefs.InvalidParameter(errors.New("multiple platforms are not supported"))
	}
	img, err := i.GetImage(ctx, imageRef, backend.GetImageOpts{Platform: platform})
	if err != nil {
		return nil, err
	}
	imgID := img.ID()
	repoRefs := i.referenceStore.References(imgID.Digest())
	// using reports whether a container uses this image, either as its base
	// image or mounted via an image-typed mount point.
	using := func(c *container.Container) bool {
		if c.ImageID == imgID {
			return true
		}
		for _, mp := range c.MountPoints {
			if mp.Type == "image" {
				if mp.Spec.Source == string(imgID) {
					return true
				}
			}
		}
		return false
	}
	force := options.Force
	prune := options.PruneChildren
	var removedRepositoryRef bool
	if !isImageIDPrefix(imgID.String(), imageRef) {
		// A repository reference was given and should be removed
		// first. We can only remove this reference if either force is
		// true, there are multiple repository references to this
		// image, or there are no containers using the given reference.
		if !force && isSingleReference(repoRefs) {
			if ctr := i.containers.First(using); ctr != nil {
				// If we removed the repository reference then
				// this image would remain "dangling" and since
				// we really want to avoid that the client must
				// explicitly force its removal.
				err := errors.Errorf("conflict: unable to remove repository reference %q (must force) - container %s is using its referenced image %s", imageRef, stringid.TruncateID(ctr.ID), stringid.TruncateID(imgID.String()))
				return nil, errdefs.Conflict(err)
			}
		}
		parsedRef, err := reference.ParseNormalizedNamed(imageRef)
		if err != nil {
			return nil, err
		}
		parsedRef, err = i.removeImageRef(parsedRef)
		if err != nil {
			return nil, err
		}
		untaggedRecord := imagetypes.DeleteResponse{Untagged: reference.FamiliarString(parsedRef)}
		i.LogImageEvent(ctx, imgID.String(), imgID.String(), events.ActionUnTag)
		records = append(records, untaggedRecord)
		// Re-read the remaining references now that one has been removed.
		repoRefs = i.referenceStore.References(imgID.Digest())
		// If a tag reference was removed and the only remaining
		// references to the same repository are digest references,
		// then clean up those digest references.
		if _, isCanonical := parsedRef.(reference.Canonical); !isCanonical {
			foundRepoTagRef := false
			for _, repoRef := range repoRefs {
				if _, repoRefIsCanonical := repoRef.(reference.Canonical); !repoRefIsCanonical && parsedRef.Name() == repoRef.Name() {
					foundRepoTagRef = true
					break
				}
			}
			if !foundRepoTagRef {
				// Remove canonical references from same repository
				var remainingRefs []reference.Named
				for _, repoRef := range repoRefs {
					if _, repoRefIsCanonical := repoRef.(reference.Canonical); repoRefIsCanonical && parsedRef.Name() == repoRef.Name() {
						if _, err := i.removeImageRef(repoRef); err != nil {
							return records, err
						}
						records = append(records, imagetypes.DeleteResponse{Untagged: reference.FamiliarString(repoRef)})
					} else {
						remainingRefs = append(remainingRefs, repoRef)
					}
				}
				repoRefs = remainingRefs
			}
		}
		// If it has remaining references then the untag finished the remove
		if len(repoRefs) > 0 {
			return records, nil
		}
		removedRepositoryRef = true
	} else {
		// If an ID reference was given AND there is at most one tag
		// reference to the image AND all references are within one
		// repository, then remove all references.
		if isSingleReference(repoRefs) {
			c := conflictHard
			if !force {
				// Without force, soft conflicts also block deletion,
				// except for the references we are about to remove.
				c |= conflictSoft &^ conflictActiveReference
			}
			if conflict := i.checkImageDeleteConflict(imgID, c); conflict != nil {
				return nil, conflict
			}
			for _, repoRef := range repoRefs {
				parsedRef, err := i.removeImageRef(repoRef)
				if err != nil {
					return nil, err
				}
				i.LogImageEvent(ctx, imgID.String(), imgID.String(), events.ActionUnTag)
				records = append(records, imagetypes.DeleteResponse{Untagged: reference.FamiliarString(parsedRef)})
			}
		}
	}
	// Attempt the actual image deletion (quietly when only a repository
	// reference was removed above).
	if err := i.imageDeleteHelper(imgID, &records, force, prune, removedRepositoryRef); err != nil {
		return nil, err
	}
	metrics.ImageActions.WithValues("delete").UpdateSince(start)
	return records, nil
}
// isSingleReference returns true when all references are from one repository
// and there is at most one tag. Returns false for empty input.
func isSingleReference(repoRefs []reference.Named) bool {
	if len(repoRefs) <= 1 {
		return len(repoRefs) == 1
	}
	// Collect the repositories of all canonical (digest) references and
	// remember the single tag reference, if any; a second tag reference
	// immediately disqualifies the image.
	var tagRef reference.Named
	canonicalRepos := map[string]struct{}{}
	for _, repoRef := range repoRefs {
		_, isCanonical := repoRef.(reference.Canonical)
		switch {
		case isCanonical:
			canonicalRepos[repoRef.Name()] = struct{}{}
		case tagRef == nil:
			tagRef = repoRef
		default:
			return false
		}
	}
	if tagRef == nil {
		// No tag references at all; compare against the first canonical ref.
		tagRef = repoRefs[0]
	}
	// Everything must live in exactly one repository, and the (single) tag
	// reference must belong to that same repository.
	if len(canonicalRepos) != 1 {
		return false
	}
	_, sameRepo := canonicalRepos[tagRef.Name()]
	return sameRepo
}
// isImageIDPrefix returns whether the given possiblePrefix is a prefix of the
// given imageID. The match may be against the full ID (including the
// "algo:" portion, e.g. "sha256:...") or against the hex digest alone.
func isImageIDPrefix(imageID, possiblePrefix string) bool {
	if strings.HasPrefix(imageID, possiblePrefix) {
		return true
	}
	// Also allow matching the digest without its algorithm prefix.
	if _, digestHex, ok := strings.Cut(imageID, ":"); ok {
		return strings.HasPrefix(digestHex, possiblePrefix)
	}
	return false
}
// removeImageRef attempts to parse and remove the given image reference from
// this daemon's store of repository tag/digest references. The given
// repositoryRef must not be an image ID but a repository name followed by an
// optional tag or digest reference. If tag or digest is omitted, the default
// tag is used. Returns the resolved image reference and an error.
func (i *ImageService) removeImageRef(ref reference.Named) (reference.Named, error) {
	withDefaultTag := reference.TagNameOnly(ref)
	// The boolean result of Delete is intentionally discarded: removal is
	// treated as idempotent, so a reference that was already gone is fine.
	_, err := i.referenceStore.Delete(withDefaultTag)
	return withDefaultTag, err
}
// removeAllReferencesToImageID attempts to remove every reference to the given
// imgID from this daemon's store of repository tag/digest references. Returns
// on the first encountered error. Removed references are logged to this
// daemon's event service. An "Untagged" types.ImageDeleteResponseItem is added to the
// given list of records.
func (i *ImageService) removeAllReferencesToImageID(imgID image.ID, records *[]imagetypes.DeleteResponse) error {
	for _, ref := range i.referenceStore.References(imgID.Digest()) {
		removed, err := i.removeImageRef(ref)
		if err != nil {
			return err
		}
		i.LogImageEvent(context.TODO(), imgID.String(), imgID.String(), events.ActionUnTag)
		*records = append(*records, imagetypes.DeleteResponse{
			Untagged: reference.FamiliarString(removed),
		})
	}
	return nil
}
// ImageDeleteConflict holds a soft or hard conflict and an associated error.
// Implements the error interface.
type imageDeleteConflict struct {
	hard bool // a hard conflict cannot be overridden by force
	used bool // the image is in use by a container
	imgID image.ID // the image the conflict applies to
	message string // human-readable description of the conflict
}
// Error formats the conflict, indicating whether a forced delete could
// override it.
func (idc *imageDeleteConflict) Error() string {
	forceMsg := "must be forced"
	if idc.hard {
		forceMsg = "cannot be forced"
	}
	return fmt.Sprintf("conflict: unable to delete %s (%s) - %s", stringid.TruncateID(idc.imgID.String()), forceMsg, idc.message)
}
func (idc *imageDeleteConflict) Conflict() {}
// imageDeleteHelper attempts to delete the given image from this daemon. If
// the image has any hard delete conflicts (child images or running containers
// using the image) then it cannot be deleted. If the image has any soft delete
// conflicts (any tags/digests referencing the image or any stopped container
// using the image) then it can only be deleted if force is true. If the delete
// succeeds and prune is true, the parent images are also deleted if they do
// not have any soft or hard delete conflicts themselves. Any deleted images
// and untagged references are appended to the given records. If any error or
// conflict is encountered, it will be returned immediately without deleting
// the image. If quiet is true, any encountered conflicts will be ignored and
// the function will return nil immediately without deleting the image.
func (i *ImageService) imageDeleteHelper(imgID image.ID, records *[]imagetypes.DeleteResponse, force, prune, quiet bool) error {
	// First, determine if this image has any conflicts. Ignore soft conflicts
	// if force is true.
	c := conflictHard
	if !force {
		c |= conflictSoft
	}
	if conflict := i.checkImageDeleteConflict(imgID, c); conflict != nil {
		if quiet && (!i.imageIsDangling(imgID) || conflict.used) {
			// Ignore conflicts UNLESS the image is "dangling" or not being used in
			// which case we want the user to know.
			return nil
		}
		// There was a conflict and it's either a hard conflict OR we are not
		// forcing deletion on soft conflicts.
		return conflict
	}
	// Resolve the parent before deleting, so it can be pruned afterwards.
	parent, err := i.imageStore.GetParent(imgID)
	if err != nil {
		// There may be no parent
		parent = ""
	}
	// Delete all repository tag/digest references to this image.
	if err := i.removeAllReferencesToImageID(imgID, records); err != nil {
		return err
	}
	removedLayers, err := i.imageStore.Delete(imgID)
	if err != nil {
		return err
	}
	i.LogImageEvent(context.TODO(), imgID.String(), imgID.String(), events.ActionDelete)
	// Record the deleted image and each layer that was removed with it.
	*records = append(*records, imagetypes.DeleteResponse{Deleted: imgID.String()})
	for _, removedLayer := range removedLayers {
		*records = append(*records, imagetypes.DeleteResponse{Deleted: removedLayer.ChainID.String()})
	}
	if !prune || parent == "" {
		return nil
	}
	// We need to prune the parent image. This means delete it if there are
	// no tags/digests referencing it and there are no containers using it (
	// either running or stopped).
	// Do not force prunings, but do so quietly (stopping on any encountered
	// conflicts).
	return i.imageDeleteHelper(parent, records, false, true, true)
}
// checkImageDeleteConflict determines whether there are any conflicts
// preventing deletion of the given image from this daemon. A hard conflict is
// any image which has the given image as a parent or any running container
// using the image. A soft conflict is any tags/digest referencing the given
// image or any stopped container using the image. If ignoreSoftConflicts is
// true, this function will not check for soft conflict conditions.
//
// Checks are performed in decreasing order of severity, so the most severe
// applicable conflict is the one reported.
func (i *ImageService) checkImageDeleteConflict(imgID image.ID, mask conflictType) *imageDeleteConflict {
	// Check if the image has any descendant images.
	if mask&conflictDependentChild != 0 && len(i.imageStore.Children(imgID)) > 0 {
		return &imageDeleteConflict{
			hard: true,
			imgID: imgID,
			message: "image has dependent child images",
		}
	}
	if mask&conflictRunningContainer != 0 {
		// Check if any running container is using the image.
		running := func(c *container.Container) bool {
			return c.ImageID == imgID && c.IsRunning()
		}
		if ctr := i.containers.First(running); ctr != nil {
			return &imageDeleteConflict{
				imgID: imgID,
				hard: true,
				used: true,
				message: fmt.Sprintf("image is being used by running container %s", stringid.TruncateID(ctr.ID)),
			}
		}
	}
	// Check if any repository tags/digest reference this image.
	if mask&conflictActiveReference != 0 && len(i.referenceStore.References(imgID.Digest())) > 0 {
		return &imageDeleteConflict{
			imgID: imgID,
			message: "image is referenced in multiple repositories",
		}
	}
	if mask&conflictStoppedContainer != 0 {
		// Check if any stopped containers reference this image.
		stopped := func(c *container.Container) bool {
			return !c.IsRunning() && c.ImageID == imgID
		}
		if ctr := i.containers.First(stopped); ctr != nil {
			return &imageDeleteConflict{
				imgID: imgID,
				used: true,
				message: fmt.Sprintf("image is being used by stopped container %s", stringid.TruncateID(ctr.ID)),
			}
		}
	}
	// No conflicts: the image can be deleted.
	return nil
}
// imageIsDangling returns whether the given image is "dangling" which means
// that there are no repository references to the given image and it has no
// child images.
func (i *ImageService) imageIsDangling(imgID image.ID) bool {
	hasRefs := len(i.referenceStore.References(imgID.Digest())) > 0
	hasChildren := len(i.imageStore.Children(imgID)) > 0
	return !hasRefs && !hasChildren
}
package images
import (
"context"
"github.com/moby/moby/api/types/backend"
"github.com/moby/moby/api/types/events"
)
// LogImageEvent generates an event related to an image with only the default attributes.
func (i *ImageService) LogImageEvent(ctx context.Context, imageID, refName string, action events.Action) {
	// Detach from the caller's cancellation so the event is still emitted
	// if the originating request is cancelled.
	ctx = context.WithoutCancel(ctx)
	attrs := map[string]string{}
	if img, err := i.GetImage(ctx, imageID, backend.GetImageOpts{}); err == nil && img.Config != nil {
		// The image may already be gone (e.g. for delete events); only
		// attach its labels when it can still be resolved.
		copyAttributes(attrs, img.Config.Labels)
	}
	if refName != "" {
		attrs["name"] = refName
	}
	i.eventsService.Log(action, events.ImageEventType, events.Actor{
		ID: imageID,
		Attributes: attrs,
	})
}
// copyAttributes guarantees that labels are not mutated by event triggers.
// Every key/value pair from labels is copied into attributes; ranging over a
// nil map is a no-op in Go, so a nil labels map is handled implicitly.
func copyAttributes(attributes, labels map[string]string) {
	for key, value := range labels {
		attributes[key] = value
	}
}
package images
import (
"context"
"io"
"github.com/docker/docker/image/tarexport"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// ExportImage exports a list of images to the given output stream. The
// exported images are archived into a tar when written to the output
// stream. All images with the given tag and all versions containing
// the same tag are exported. names is the set of tags to export, and
// outStream is the writer which the images are written to.
func (i *ImageService) ExportImage(ctx context.Context, names []string, platform *ocispec.Platform, outStream io.Writer) error {
	// Build a one-shot exporter over this service's stores and save directly.
	return tarexport.NewTarExporter(i.imageStore, i.layerStore, i.referenceStore, i, platform).Save(ctx, names, outStream)
}
// LoadImage uploads a set of images into the repository. This is the
// complement of ExportImage. The input stream is an uncompressed tar
// ball containing images and metadata.
func (i *ImageService) LoadImage(ctx context.Context, inTar io.ReadCloser, platform *ocispec.Platform, outStream io.Writer, quiet bool) error {
	// Build a one-shot exporter over this service's stores and load directly.
	return tarexport.NewTarExporter(i.imageStore, i.layerStore, i.referenceStore, i, platform).Load(ctx, inTar, outStream, quiet)
}
package images
import (
"context"
"errors"
"time"
"github.com/distribution/reference"
"github.com/docker/docker/daemon/internal/metrics"
"github.com/docker/docker/layer"
"github.com/moby/moby/api/types/backend"
"github.com/moby/moby/api/types/image"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// ImageHistory returns a slice of ImageHistory structures for the specified image
// name by walking the image lineage.
func (i *ImageService) ImageHistory(ctx context.Context, name string, platform *ocispec.Platform) ([]*image.HistoryResponseItem, error) {
	start := time.Now()
	img, err := i.GetImage(ctx, name, backend.GetImageOpts{Platform: platform})
	if err != nil {
		return nil, err
	}
	history := []*image.HistoryResponseItem{}
	layerCounter := 0
	// Rebuild the layer chain incrementally: start from an empty root FS and
	// append one DiffID per non-empty history entry, so each entry's ChainID
	// (and hence layer size) can be resolved from the layer store.
	rootFS := *img.RootFS
	rootFS.DiffIDs = nil
	for _, h := range img.History {
		var layerSize int64
		if !h.EmptyLayer {
			if len(img.RootFS.DiffIDs) <= layerCounter {
				return nil, errors.New("too many non-empty layers in History section")
			}
			rootFS.Append(img.RootFS.DiffIDs[layerCounter])
			l, err := i.layerStore.Get(rootFS.ChainID())
			if err != nil {
				return nil, err
			}
			layerSize = l.DiffSize()
			layer.ReleaseAndLog(i.layerStore, l)
			layerCounter++
		}
		var created int64
		if h.Created != nil {
			created = h.Created.Unix()
		}
		// Prepend so the newest history entry ends up first in the result.
		history = append([]*image.HistoryResponseItem{{
			ID: "<missing>",
			Created: created,
			CreatedBy: h.CreatedBy,
			Comment: h.Comment,
			Size: layerSize,
		}}, history...)
	}
	// Fill in image IDs and tags by walking up the parent chain; entries
	// beyond the known lineage keep the "<missing>" placeholder ID.
	histImg := img
	id := img.ID()
	for _, h := range history {
		h.ID = id.String()
		var tags []string
		for _, r := range i.referenceStore.References(id.Digest()) {
			if _, ok := r.(reference.NamedTagged); ok {
				tags = append(tags, reference.FamiliarString(r))
			}
		}
		h.Tags = tags
		id = histImg.Parent
		if id == "" {
			break
		}
		// A missing ancestor ends the walk; remaining entries stay "<missing>".
		histImg, err = i.GetImage(ctx, id.String(), backend.GetImageOpts{})
		if err != nil {
			break
		}
	}
	metrics.ImageActions.WithValues("history").UpdateSince(start)
	return history, nil
}
package images
import (
"context"
"encoding/json"
"io"
"time"
"github.com/containerd/platforms"
"github.com/distribution/reference"
"github.com/docker/docker/daemon/builder/dockerfile"
"github.com/docker/docker/dockerversion"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
"github.com/moby/go-archive/compression"
"github.com/moby/moby/api/types/container"
"github.com/moby/moby/api/types/events"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// ImportImage imports an image, getting the archived layer data from layerReader.
// Uncompressed layer archive is passed to the layerStore and handled by the
// underlying graph driver.
// Image is tagged with the given reference.
// If the platform is nil, the default host platform is used.
// Message is used as the image's history comment.
// Image configuration is derived from the dockerfile instructions in changes.
func (i *ImageService) ImportImage(ctx context.Context, newRef reference.Named, platform *ocispec.Platform, msg string, layerReader io.Reader, changes []string) (image.ID, error) {
	if platform == nil {
		def := platforms.DefaultSpec()
		platform = &def
	}
	if err := image.CheckOS(platform.OS); err != nil {
		return "", err
	}

	// Derive the image config from the Dockerfile-style instructions.
	config, err := dockerfile.BuildFromConfig(ctx, &container.Config{}, changes, platform.OS)
	if err != nil {
		return "", errdefs.InvalidParameter(err)
	}

	inflatedLayerData, err := compression.DecompressStream(layerReader)
	if err != nil {
		return "", err
	}
	// Close the decompression stream to release its resources; Register
	// consumes the stream but takes an io.Reader and does not close it.
	defer inflatedLayerData.Close()

	l, err := i.layerStore.Register(inflatedLayerData, "")
	if err != nil {
		return "", err
	}
	// The image config captures the layer's DiffID; release our reference.
	defer layer.ReleaseAndLog(i.layerStore, l)

	created := time.Now().UTC()
	imgConfig, err := json.Marshal(&image.Image{
		V1Image: image.V1Image{
			DockerVersion: dockerversion.Version,
			Config:        config,
			Architecture:  platform.Architecture,
			Variant:       platform.Variant,
			OS:            platform.OS,
			Created:       &created,
			Comment:       msg,
		},
		RootFS: &image.RootFS{
			Type:    "layers",
			DiffIDs: []layer.DiffID{l.DiffID()},
		},
		History: []image.History{{
			Created: &created,
			Comment: msg,
		}},
	})
	if err != nil {
		return "", err
	}

	id, err := i.imageStore.Create(imgConfig)
	if err != nil {
		return "", err
	}

	// Tag the new image when a reference was supplied.
	if newRef != nil {
		if err := i.TagImage(ctx, id, newRef); err != nil {
			return "", err
		}
	}

	i.LogImageEvent(ctx, id.String(), id.String(), events.ActionImport)
	return id, nil
}
package images
import (
"context"
"time"
"github.com/distribution/reference"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
"github.com/moby/moby/api/types/backend"
imagetypes "github.com/moby/moby/api/types/image"
"github.com/moby/moby/api/types/storage"
)
// ImageInspect resolves refOrID (optionally for a specific platform) and
// assembles the detailed inspect response: repository references, timestamps,
// layer/size information, graph-driver metadata, and the image configuration.
func (i *ImageService) ImageInspect(ctx context.Context, refOrID string, opts backend.ImageInspectOpts) (*imagetypes.InspectResponse, error) {
	img, err := i.GetImage(ctx, refOrID, backend.GetImageOpts{Platform: opts.Platform})
	if err != nil {
		return nil, err
	}
	size, layerMetadata, err := i.getLayerSizeAndMetadata(img)
	if err != nil {
		return nil, err
	}
	lastUpdated, err := i.imageStore.GetLastUpdated(img.ID())
	if err != nil {
		return nil, err
	}
	// Split the image's references into tags and digests.
	var repoTags, repoDigests []string
	for _, ref := range i.referenceStore.References(img.ID().Digest()) {
		switch ref.(type) {
		case reference.NamedTagged:
			repoTags = append(repoTags, reference.FamiliarString(ref))
		case reference.Canonical:
			repoDigests = append(repoDigests, reference.FamiliarString(ref))
		}
	}
	// Fall back to the most recent history entry's comment when the image
	// itself has none.
	comment := img.Comment
	if comment == "" && len(img.History) > 0 {
		comment = img.History[len(img.History)-1].Comment
	}
	var created string
	if img.Created != nil {
		created = img.Created.Format(time.RFC3339Nano)
	}
	var layers []string
	for _, l := range img.RootFS.DiffIDs {
		layers = append(layers, l.String())
	}
	imgConfig := containerConfigToDockerOCIImageConfig(img.Config)
	return &imagetypes.InspectResponse{
		ID: img.ID().String(),
		RepoTags: repoTags,
		RepoDigests: repoDigests,
		Parent: img.Parent.String(),
		Comment: comment,
		Created: created,
		Container: img.Container, //nolint:staticcheck // ignore SA1019: field is deprecated, but still set on API < v1.45.
		ContainerConfig: &img.ContainerConfig, //nolint:staticcheck // ignore SA1019: field is deprecated, but still set on API < v1.45.
		DockerVersion: img.DockerVersion,
		Author: img.Author,
		Config: &imgConfig,
		Architecture: img.Architecture,
		Variant: img.Variant,
		Os: img.OperatingSystem(),
		OsVersion: img.OSVersion,
		Size: size,
		GraphDriver: storage.DriverData{
			Name: i.layerStore.DriverName(),
			Data: layerMetadata,
		},
		RootFS: imagetypes.RootFS{
			Type: img.RootFS.Type,
			Layers: layers,
		},
		Metadata: imagetypes.Metadata{
			LastTagTime: lastUpdated,
		},
	}, nil
}
// getLayerSizeAndMetadata resolves the image's top layer (if any) and returns
// its size together with the driver-specific layer metadata.
func (i *ImageService) getLayerSizeAndMetadata(img *image.Image) (int64, map[string]string, error) {
	chainID := img.RootFS.ChainID()
	if chainID == "" {
		// No layers (e.g. an empty root FS): zero size, no metadata.
		return 0, nil, nil
	}
	l, err := i.layerStore.Get(chainID)
	if err != nil {
		return 0, nil, err
	}
	defer layer.ReleaseAndLog(i.layerStore, l)

	metadata, err := l.Metadata()
	if err != nil {
		return 0, nil, err
	}
	return l.Size(), metadata, nil
}
package images
import (
"context"
"errors"
"fmt"
"sort"
"time"
"github.com/distribution/reference"
"github.com/docker/docker/daemon/container"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
"github.com/moby/moby/api/types/backend"
imagetypes "github.com/moby/moby/api/types/image"
timetypes "github.com/moby/moby/api/types/time"
)
// acceptedImageFilterTags is the set of filter names accepted by Images;
// any other filter name is rejected during filter validation.
var acceptedImageFilterTags = map[string]bool{
	"dangling": true,
	"label": true,
	"before": true,
	"since": true,
	"reference": true,
	"until": true,
}
// byCreated is a temporary type used to sort a list of images by creation
// time.
type byCreated []*imagetypes.Summary

// Len, Swap, and Less implement sort.Interface (ascending by Created).
func (r byCreated) Len() int { return len(r) }
func (r byCreated) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
func (r byCreated) Less(i, j int) bool { return r[i].Created < r[j].Created }
// Images returns a filtered list of images.
func (i *ImageService) Images(ctx context.Context, opts imagetypes.ListOptions) ([]*imagetypes.Summary, error) {
if err := opts.Filters.Validate(acceptedImageFilterTags); err != nil {
return nil, err
}
danglingOnly, err := opts.Filters.GetBoolOrDefault("dangling", false)
if err != nil {
return nil, err
}
var beforeFilter, sinceFilter time.Time
err = opts.Filters.WalkValues("before", func(value string) error {
img, err := i.GetImage(ctx, value, backend.GetImageOpts{})
if err != nil {
return err
}
// Resolve multiple values to the oldest image,
// equivalent to ANDing all the values together.
if img.Created != nil && (beforeFilter.IsZero() || beforeFilter.After(*img.Created)) {
beforeFilter = *img.Created
}
return nil
})
if err != nil {
return nil, err
}
err = opts.Filters.WalkValues("until", func(value string) error {
ts, err := timetypes.GetTimestamp(value, time.Now())
if err != nil {
return err
}
seconds, nanoseconds, err := timetypes.ParseTimestamps(ts, 0)
if err != nil {
return err
}
timestamp := time.Unix(seconds, nanoseconds)
if beforeFilter.IsZero() || beforeFilter.After(timestamp) {
beforeFilter = timestamp
}
return nil
})
if err != nil {
return nil, err
}
err = opts.Filters.WalkValues("since", func(value string) error {
img, err := i.GetImage(ctx, value, backend.GetImageOpts{})
if err != nil {
return err
}
// Resolve multiple values to the newest image,
// equivalent to ANDing all the values together.
if img.Created != nil && sinceFilter.Before(*img.Created) {
sinceFilter = *img.Created
}
return nil
})
if err != nil {
return nil, err
}
var selectedImages map[image.ID]*image.Image
if danglingOnly {
selectedImages = i.imageStore.Heads()
} else {
selectedImages = i.imageStore.Map()
}
var (
summaries = make([]*imagetypes.Summary, 0, len(selectedImages))
summaryMap = make(map[*image.Image]*imagetypes.Summary, len(selectedImages))
allContainers []*container.Container
)
for id, img := range selectedImages {
select {
case <-ctx.Done():
return nil, ctx.Err()
default:
}
if !beforeFilter.IsZero() && (img.Created == nil || !img.Created.Before(beforeFilter)) {
continue
}
if !sinceFilter.IsZero() && (img.Created == nil || !img.Created.After(sinceFilter)) {
continue
}
if opts.Filters.Contains("label") {
// Very old image that do not have image.Config (or even labels)
if img.Config == nil {
continue
}
// We are now sure image.Config is not nil
if !opts.Filters.MatchKVList("label", img.Config.Labels) {
continue
}
}
// Skip any images with an unsupported operating system to avoid a potential
// panic when indexing through the layerstore. Don't error as we want to list
// the other images. This should never happen, but here as a safety precaution.
if err := image.CheckOS(img.OperatingSystem()); err != nil {
continue
}
var size int64
if layerID := img.RootFS.ChainID(); layerID != "" {
l, err := i.layerStore.Get(layerID)
if err != nil {
// The layer may have been deleted between the call to `Map()` or
// `Heads()` and the call to `Get()`, so we just ignore this error
if errors.Is(err, layer.ErrLayerDoesNotExist) {
continue
}
return nil, err
}
size = l.Size()
layer.ReleaseAndLog(i.layerStore, l)
}
summary := newImageSummary(img, size)
for _, ref := range i.referenceStore.References(id.Digest()) {
if opts.Filters.Contains("reference") {
var found bool
var matchErr error
for _, pattern := range opts.Filters.Get("reference") {
found, matchErr = reference.FamiliarMatch(pattern, ref)
if matchErr != nil {
return nil, matchErr
}
if found {
break
}
}
if !found {
continue
}
}
if _, ok := ref.(reference.Canonical); ok {
summary.RepoDigests = append(summary.RepoDigests, reference.FamiliarString(ref))
}
if _, ok := ref.(reference.NamedTagged); ok {
summary.RepoTags = append(summary.RepoTags, reference.FamiliarString(ref))
}
}
if summary.RepoDigests == nil && summary.RepoTags == nil {
if opts.All || len(i.imageStore.Children(id)) == 0 {
if opts.Filters.Contains("dangling") && !danglingOnly {
// dangling=false case, so dangling image is not needed
continue
}
if opts.Filters.Contains("reference") { // skip images with no references if filtering by reference
continue
}
} else {
continue
}
} else if danglingOnly && len(summary.RepoTags) > 0 {
continue
}
// Lazily init allContainers.
if allContainers == nil {
allContainers = i.containers.List()
}
// Get container count
var containersCount int64
for _, c := range allContainers {
if c.ImageID == id {
containersCount++
}
}
summary.Containers = containersCount
summaryMap[img] = summary
summaries = append(summaries, summary)
}
if opts.SharedSize {
allLayers := i.layerStore.Map()
layerRefs := make(map[layer.ChainID]int, len(allLayers))
allImages := selectedImages
if danglingOnly {
// If danglingOnly is true, then selectedImages include only dangling images,
// but we need to consider all existing images to correctly perform reference counting.
// If danglingOnly is false, selectedImages (and, hence, allImages) is already equal to i.imageStore.Map()
// and we can avoid performing an otherwise redundant method call.
allImages = i.imageStore.Map()
}
// Count layer references across all known images
for _, img := range allImages {
rootFS := *img.RootFS
rootFS.DiffIDs = nil
for _, id := range img.RootFS.DiffIDs {
rootFS.Append(id)
layerRefs[rootFS.ChainID()]++
}
}
// Get Shared sizes
for img, summary := range summaryMap {
rootFS := *img.RootFS
rootFS.DiffIDs = nil
// Indicate that we collected shared size information (default is -1, or "not set")
summary.SharedSize = 0
for _, id := range img.RootFS.DiffIDs {
rootFS.Append(id)
chid := rootFS.ChainID()
if layerRefs[chid] > 1 {
if _, ok := allLayers[chid]; !ok {
return nil, fmt.Errorf("layer %v was not found (corruption?)", chid)
}
summary.SharedSize += allLayers[chid].DiffSize()
}
}
}
}
sort.Sort(sort.Reverse(byCreated(summaries)))
return summaries, nil
}
// newImageSummary builds the API summary for img, using size as the image's
// content size. SharedSize and Containers are initialized to -1 ("not
// computed"): the JSON encoding uses "omitempty", so a 0 default could not be
// told apart from "not set" — callers overwrite these when they collect the
// actual values.
func newImageSummary(img *image.Image, size int64) *imagetypes.Summary {
	var createdAt int64
	if created := img.Created; created != nil {
		createdAt = created.Unix()
	}
	summary := imagetypes.Summary{
		ParentID: img.Parent.String(),
		ID:       img.ID().String(),
		Created:  createdAt,
		Size:     size,
		// -1 indicates that the value has not been set (avoids ambiguity
		// between 0 (default) and "not set". We cannot use a pointer (nil)
		// for this, as the JSON representation uses "omitempty", which would
		// consider both "0" and "nil" to be "empty".
		SharedSize: -1,
		Containers: -1,
	}
	if cfg := img.Config; cfg != nil {
		summary.Labels = cfg.Labels
	}
	return &summary
}
package images
import (
"context"
"strconv"
"time"
cerrdefs "github.com/containerd/errdefs"
"github.com/containerd/log"
"github.com/distribution/reference"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
"github.com/moby/moby/api/types/events"
"github.com/moby/moby/api/types/filters"
imagetypes "github.com/moby/moby/api/types/image"
timetypes "github.com/moby/moby/api/types/time"
"github.com/opencontainers/go-digest"
"github.com/pkg/errors"
)
// imagesAcceptedFilters is the set of filter names accepted by ImagesPrune;
// any other filter key is rejected by pruneFilters.Validate.
var imagesAcceptedFilters = map[string]bool{
	"dangling": true,
	"label":    true,
	"label!":   true,
	"until":    true,
}

// errPruneRunning is returned when a prune request is received while
// one is in progress
var errPruneRunning = errdefs.Conflict(errors.New("a prune operation is already running"))
// ImagesPrune removes unused images
//
// Only one prune may run at a time; concurrent calls fail with
// errPruneRunning. Accepted filters are listed in imagesAcceptedFilters
// ("dangling", "label", "label!", "until"). The returned report lists the
// deleted images and the total space reclaimed; if the context is cancelled
// mid-deletion, the partial report accumulated so far is still returned.
func (i *ImageService) ImagesPrune(ctx context.Context, pruneFilters filters.Args) (*imagetypes.PruneReport, error) {
	// Atomically claim the "prune in progress" flag; released on return.
	if !i.pruneRunning.CompareAndSwap(false, true) {
		return nil, errPruneRunning
	}
	defer i.pruneRunning.Store(false)

	// make sure that only accepted filters have been received
	err := pruneFilters.Validate(imagesAcceptedFilters)
	if err != nil {
		return nil, err
	}

	rep := &imagetypes.PruneReport{}

	// "dangling" defaults to true: prune only untagged images unless
	// dangling=false was passed explicitly.
	danglingOnly, err := pruneFilters.GetBoolOrDefault("dangling", true)
	if err != nil {
		return nil, err
	}

	until, err := getUntilFromPruneFilters(pruneFilters)
	if err != nil {
		return nil, err
	}

	var allImages map[image.ID]*image.Image
	if danglingOnly {
		allImages = i.imageStore.Heads()
	} else {
		allImages = i.imageStore.Map()
	}

	// Filter intermediary images and get their unique size
	allLayers := i.layerStore.Map()
	topImages := map[image.ID]*image.Image{}
	for id, img := range allImages {
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		default:
			dgst := digest.Digest(id)
			// Skip intermediate images: unreferenced images that still have
			// children are build intermediates, not prune candidates.
			if len(i.referenceStore.References(dgst)) == 0 && len(i.imageStore.Children(id)) != 0 {
				continue
			}
			// "until" keeps images created after the cutoff, and images whose
			// creation time is unknown.
			if !until.IsZero() && (img.Created == nil || img.Created.After(until)) {
				continue
			}
			if img.Config != nil && !matchLabels(pruneFilters, img.Config.Labels) {
				continue
			}
			topImages[id] = img
		}
	}

	canceled := false
deleteImagesLoop:
	for id := range topImages {
		select {
		case <-ctx.Done():
			// we still want to calculate freed size and return the data
			canceled = true
			break deleteImagesLoop
		default:
		}

		deletedImages := []imagetypes.DeleteResponse{}
		refs := i.referenceStore.References(id.Digest())
		if len(refs) > 0 {
			shouldDelete := !danglingOnly
			if !shouldDelete {
				hasTag := false
				for _, ref := range refs {
					if _, ok := ref.(reference.NamedTagged); ok {
						hasTag = true
						break
					}
				}

				// Only delete if none of its references is a valid NamedTagged
				// (i.e. the image is only referenced by digests).
				shouldDelete = !hasTag
			}

			if shouldDelete {
				for _, ref := range refs {
					imgDel, err := i.ImageDelete(ctx, ref.String(), imagetypes.RemoveOptions{
						PruneChildren: true,
					})
					// Failed deletions are logged/skipped, not fatal.
					if imageDeleteFailed(ref.String(), err) {
						continue
					}
					deletedImages = append(deletedImages, imgDel...)
				}
			}
		} else {
			// Untagged image: delete by the digest's hex-encoded portion.
			hex := id.Digest().Encoded()
			imgDel, err := i.ImageDelete(ctx, hex, imagetypes.RemoveOptions{
				PruneChildren: true,
			})
			if imageDeleteFailed(hex, err) {
				continue
			}
			deletedImages = append(deletedImages, imgDel...)
		}

		rep.ImagesDeleted = append(rep.ImagesDeleted, deletedImages...)
	}

	// Compute how much space was freed
	for _, d := range rep.ImagesDeleted {
		if d.Deleted != "" {
			if l, ok := allLayers[layer.ChainID(d.Deleted)]; ok {
				rep.SpaceReclaimed += uint64(l.DiffSize())
			}
		}
	}

	if canceled {
		log.G(ctx).Debugf("ImagesPrune operation cancelled: %#v", *rep)
	}
	i.eventsService.Log(events.ActionPrune, events.ImageEventType, events.Actor{
		Attributes: map[string]string{
			"reclaimed": strconv.FormatUint(rep.SpaceReclaimed, 10),
		},
	})

	return rep, nil
}
// imageDeleteFailed reports whether the error returned when deleting ref
// should be treated as a failed (skipped) deletion. A nil error is success.
// Conflicts and context cancellation/deadline errors are failures that are
// expected and not logged; any other error is logged as a warning before
// being reported as a failure.
func imageDeleteFailed(ref string, err error) bool {
	if err == nil {
		return false
	}
	if cerrdefs.IsConflict(err) || errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
		return true
	}
	log.G(context.TODO()).Warnf("failed to prune image %s: %v", ref, err)
	return true
}
// matchLabels reports whether labels satisfy both label filters in
// pruneFilters: every "label" entry must match, and, when a "label!" filter
// is present, none of its entries may match.
func matchLabels(pruneFilters filters.Args, labels map[string]string) bool {
	if !pruneFilters.MatchKVList("label", labels) {
		return false
	}
	// MatchKVList returns true when the field is absent, so the negative
	// "label!" filter needs an explicit presence check before matching.
	if pruneFilters.Contains("label!") && pruneFilters.MatchKVList("label!", labels) {
		return false
	}
	return true
}
// getUntilFromPruneFilters parses the single optional "until" filter value
// into a time.Time. The zero time is returned when the filter is absent; an
// error is returned when more than one value was supplied or the value
// cannot be parsed as a timestamp.
func getUntilFromPruneFilters(pruneFilters filters.Args) (time.Time, error) {
	if !pruneFilters.Contains("until") {
		return time.Time{}, nil
	}
	values := pruneFilters.Get("until")
	if len(values) > 1 {
		return time.Time{}, errors.New("more than one until filter specified")
	}
	ts, err := timetypes.GetTimestamp(values[0], time.Now())
	if err != nil {
		return time.Time{}, err
	}
	secs, nanos, err := timetypes.ParseTimestamps(ts, 0)
	if err != nil {
		return time.Time{}, err
	}
	return time.Unix(secs, nanos), nil
}
package images
import (
"context"
"io"
"time"
"github.com/containerd/containerd/v2/core/leases"
"github.com/containerd/containerd/v2/pkg/namespaces"
cerrdefs "github.com/containerd/errdefs"
"github.com/containerd/log"
"github.com/distribution/reference"
"github.com/docker/docker/daemon/internal/metrics"
"github.com/docker/docker/distribution"
progressutils "github.com/docker/docker/distribution/utils"
"github.com/docker/docker/pkg/progress"
"github.com/docker/docker/pkg/streamformatter"
"github.com/moby/moby/api/types/backend"
"github.com/moby/moby/api/types/registry"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// PullImage initiates a pull operation. image is the repository name to pull, and
// tag may be either empty, or indicate a specific tag to pull.
func (i *ImageService) PullImage(ctx context.Context, ref reference.Named, platform *ocispec.Platform, metaHeaders map[string][]string, authConfig *registry.AuthConfig, outStream io.Writer) error {
	start := time.Now()

	err := i.pullImageWithReference(ctx, ref, platform, metaHeaders, authConfig, outStream)
	// Record the pull-duration metric regardless of success or failure.
	metrics.ImageActions.WithValues("pull").UpdateSince(start)
	if err != nil {
		return err
	}

	if platform != nil {
		// If --platform was specified, check that the image we pulled matches
		// the expected platform. This check is for situations where the image
		// is a single-arch image, in which case (for backward compatibility),
		// we allow the image to have a non-matching architecture. The code
		// below checks for this situation, and returns a warning to the client,
		// as well as logging it to the daemon logs.
		img, err := i.GetImage(ctx, ref.String(), backend.GetImageOpts{Platform: platform})

		// Note that this is a special case where GetImage returns both an image
		// and an error: https://github.com/docker/docker/blob/v20.10.7/daemon/images/image.go#L175-L183
		if cerrdefs.IsNotFound(err) && img != nil {
			// Platform mismatch on a single-arch image: warn the client and
			// the daemon log, but do not fail the pull.
			po := streamformatter.NewJSONProgressOutput(outStream, false)
			progress.Messagef(po, "", `WARNING: %s`, err.Error())
			log.G(ctx).WithError(err).WithField("image", reference.FamiliarName(ref)).Warn("ignoring platform mismatch on single-arch image")
		} else if err != nil {
			return err
		}
	}
	return nil
}
// pullImageWithReference performs the actual pull, streaming JSON progress
// messages to outStream. It attaches a temporary content-store lease for the
// duration of the pull so fetched content is not garbage-collected before it
// is referenced by the image's own lease.
func (i *ImageService) pullImageWithReference(ctx context.Context, ref reference.Named, platform *ocispec.Platform, metaHeaders map[string][]string, authConfig *registry.AuthConfig, outStream io.Writer) error {
	// Include a buffer so that slow client connections don't affect
	// transfer performance.
	progressChan := make(chan progress.Progress, 100)

	writesDone := make(chan struct{})

	ctx, cancelFunc := context.WithCancel(ctx)

	// The writer goroutine drains progressChan to the client; it is handed
	// cancelFunc so it can abort the pull if the client connection drops.
	go func() {
		progressutils.WriteDistributionProgress(cancelFunc, outStream, progressChan)
		close(writesDone)
	}()

	ctx = namespaces.WithNamespace(ctx, i.contentNamespace)
	// Take out a temporary lease for everything that gets persisted to the content store.
	// Before the lease is cancelled, any content we want to keep should have its own lease applied.
	ctx, done, err := tempLease(ctx, i.leases)
	if err != nil {
		return err
	}
	defer done(ctx)

	// cs records every digest committed during this pull so imageStore can
	// attach them to the image's lease (see imageStoreForPull.updateLease).
	cs := &contentStoreForPull{
		ContentStore: i.content,
		leases:       i.leases,
	}
	imageStore := &imageStoreForPull{
		ImageConfigStore: distribution.NewImageConfigStoreFromStore(i.imageStore),
		ingested:         cs,
		leases:           i.leases,
	}

	imagePullConfig := &distribution.ImagePullConfig{
		Config: distribution.Config{
			MetaHeaders:      metaHeaders,
			AuthConfig:       authConfig,
			ProgressOutput:   progress.ChanOutput(progressChan),
			RegistryService:  i.registryService,
			ImageEventLogger: i.LogImageEvent,
			MetadataStore:    i.distributionMetadataStore,
			ImageStore:       imageStore,
			ReferenceStore:   i.referenceStore,
		},
		DownloadManager: i.downloadManager,
		Platform:        platform,
	}

	err = distribution.Pull(ctx, ref, imagePullConfig, cs)
	// Close the progress channel and wait for the writer goroutine to finish
	// draining it, so all progress reaches the client before returning.
	close(progressChan)
	<-writesDone
	return err
}
// tempLease ensures ctx carries a content lease. If one is already attached
// the context is returned unchanged with a no-op cleanup; otherwise a new
// random-ID lease is created, attached to the returned context, and the
// returned function deletes it.
func tempLease(ctx context.Context, mgr leases.Manager) (context.Context, func(context.Context) error, error) {
	noop := func(context.Context) error { return nil }
	if _, ok := leases.FromContext(ctx); ok {
		return ctx, noop, nil
	}

	// Expire the lease after 24h so content is eventually cleaned up even if
	// the daemon dies (crash, SIGKILL, ...) before the delete callback runs.
	l, err := mgr.Create(ctx,
		leases.WithRandomID(),
		leases.WithExpiration(24*time.Hour),
		leases.WithLabels(map[string]string{
			"moby.lease/temporary": time.Now().UTC().Format(time.RFC3339Nano),
		}),
	)
	if err != nil {
		return ctx, noop, errors.Wrap(err, "error creating temporary lease")
	}

	release := func(ctx context.Context) error {
		return mgr.Delete(ctx, l)
	}
	return leases.WithLease(ctx, l.ID), release, nil
}
package images
import (
"context"
"io"
"time"
"github.com/distribution/reference"
"github.com/docker/distribution/manifest/schema2"
"github.com/docker/docker/daemon/internal/metrics"
"github.com/docker/docker/distribution"
progressutils "github.com/docker/docker/distribution/utils"
"github.com/docker/docker/pkg/progress"
"github.com/moby/moby/api/types/backend"
"github.com/moby/moby/api/types/registry"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// PushImage initiates a push operation on the repository named localName.
func (i *ImageService) PushImage(ctx context.Context, ref reference.Named, platform *ocispec.Platform, metaHeaders map[string][]string, authConfig *registry.AuthConfig, outStream io.Writer) error {
	if platform != nil {
		// Check if the image is actually the platform we want to push.
		_, err := i.GetImage(ctx, ref.String(), backend.GetImageOpts{Platform: platform})
		if err != nil {
			return err
		}
	}

	start := time.Now()

	// Include a buffer so that slow client connections don't affect
	// transfer performance.
	progressChan := make(chan progress.Progress, 100)

	writesDone := make(chan struct{})

	ctx, cancelFunc := context.WithCancel(ctx)

	// The writer goroutine drains progressChan to the client; it is handed
	// cancelFunc so it can abort the push if the client connection drops.
	go func() {
		progressutils.WriteDistributionProgress(cancelFunc, outStream, progressChan)
		close(writesDone)
	}()

	imagePushConfig := &distribution.ImagePushConfig{
		Config: distribution.Config{
			MetaHeaders:      metaHeaders,
			AuthConfig:       authConfig,
			ProgressOutput:   progress.ChanOutput(progressChan),
			RegistryService:  i.registryService,
			ImageEventLogger: i.LogImageEvent,
			MetadataStore:    i.distributionMetadataStore,
			ImageStore:       distribution.NewImageConfigStoreFromStore(i.imageStore),
			ReferenceStore:   i.referenceStore,
		},
		ConfigMediaType: schema2.MediaTypeImageConfig,
		LayerStores:     distribution.NewLayerProvidersFromStore(i.layerStore),
		UploadManager:   i.uploadManager,
	}

	err := distribution.Push(ctx, ref, imagePushConfig)
	// Drain all progress to the client, then record the push-duration metric
	// (recorded for failures too).
	close(progressChan)
	<-writesDone
	metrics.ImageActions.WithValues("push").UpdateSince(start)
	return err
}
package images
import (
"encoding/json"
"fmt"
"time"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
"github.com/pkg/errors"
)
// SquashImage creates a new image with the diff of the specified image and the specified parent.
// This new image contains only the layers from it's parent + 1 extra layer which contains the diff of all the layers in between.
// The existing image(s) is not destroyed.
// If no parent is specified, a new image with the diff of all the specified image's layers merged into a new layer that has no parents.
//
// Returns the ID of the newly created image.
func (i *ImageService) SquashImage(id, parent string) (string, error) {
	var (
		img *image.Image
		err error
	)
	if img, err = i.imageStore.Get(image.ID(id)); err != nil {
		return "", err
	}

	var parentImg *image.Image
	var parentChainID layer.ChainID
	if parent != "" {
		parentImg, err = i.imageStore.Get(image.ID(parent))
		if err != nil {
			return "", errors.Wrap(err, "error getting specified parent layer")
		}
		parentChainID = parentImg.RootFS.ChainID()
	} else {
		// No parent specified: squash onto an empty root filesystem
		// (parentChainID stays the zero value).
		rootFS := image.NewRootFS()
		parentImg = &image.Image{RootFS: rootFS}
	}

	l, err := i.layerStore.Get(img.RootFS.ChainID())
	if err != nil {
		return "", errors.Wrap(err, "error getting image layer")
	}
	defer i.layerStore.Release(l)

	// Tar stream of everything between the parent chain and the image's top
	// layer; this becomes the single squashed layer.
	ts, err := l.TarStreamFrom(parentChainID)
	if err != nil {
		return "", errors.Wrapf(err, "error getting tar stream to parent")
	}
	defer ts.Close()

	newL, err := i.layerStore.Register(ts, parentChainID)
	if err != nil {
		return "", errors.Wrap(err, "error registering layer")
	}
	defer i.layerStore.Release(newL)

	// Copy the source image's config, replacing its rootfs with the parent's
	// diff IDs plus the newly registered squashed layer.
	newImage := *img

	newImage.RootFS = nil

	rootFS := *parentImg.RootFS
	rootFS.DiffIDs = append(rootFS.DiffIDs, newL.DiffID())
	newImage.RootFS = &rootFS

	// History entries beyond the parent's now describe layers that were
	// merged into the squashed layer, so mark them as empty layers.
	for i, hi := range newImage.History {
		if i >= len(parentImg.History) {
			hi.EmptyLayer = true
		}
		newImage.History[i] = hi
	}

	now := time.Now()
	var historyComment string
	if parent != "" {
		historyComment = fmt.Sprintf("merge %s to %s", id, parent)
	} else {
		historyComment = fmt.Sprintf("create new from %s", id)
	}

	newImage.History = append(newImage.History, image.History{
		Created: &now,
		Comment: historyComment,
	})
	newImage.Created = &now

	b, err := json.Marshal(&newImage)
	if err != nil {
		return "", errors.Wrap(err, "error marshalling image config")
	}

	newImgID, err := i.imageStore.Create(b)
	if err != nil {
		return "", errors.Wrap(err, "error creating new image after squash")
	}
	return string(newImgID), nil
}
package images
import (
"context"
"github.com/distribution/reference"
"github.com/docker/docker/image"
"github.com/moby/moby/api/types/events"
)
// TagImage adds the given reference to the image ID provided.
func (i *ImageService) TagImage(ctx context.Context, imageID image.ID, newTag reference.Named) error {
	// Register the tag first; only then refresh the image's LastUpdated
	// stamp and emit the "tag" event.
	if err := i.referenceStore.AddTag(newTag, imageID.Digest(), true); err != nil {
		return err
	}

	err := i.imageStore.SetLastUpdated(imageID)
	if err != nil {
		return err
	}

	i.LogImageEvent(ctx, imageID.String(), reference.FamiliarString(newTag), events.ActionTag)
	return nil
}
//go:build linux || freebsd
package images
import (
"context"
"github.com/containerd/log"
"github.com/docker/docker/daemon/container"
"github.com/docker/docker/image"
)
// GetLayerFolders returns the layer folders from an image RootFS
//
// This operation only has meaning on Windows; in this (linux/freebsd) build
// it must never be reached, hence the panic.
func (i *ImageService) GetLayerFolders(img *image.Image, rwLayer container.RWLayer, containerID string) ([]string, error) {
	// Windows specific
	panic("not implemented")
}
// GetContainerLayerSize returns the real size & virtual size of the container.
//
// sizeRw is the size of the container's writable (RW) layer; sizeRootfs adds
// the size of the read-only parent chain. Lookup and size errors are logged
// rather than returned (best-effort); sizeRw of -1 signals an unknown RW size.
func (i *ImageService) GetContainerLayerSize(ctx context.Context, containerID string) (int64, int64, error) {
	var (
		sizeRw, sizeRootfs int64
		err                error
	)

	// Safe to index by runtime.GOOS as Unix hosts don't support multiple
	// container operating systems.
	rwlayer, err := i.layerStore.GetRWLayer(containerID)
	if err != nil {
		// Best-effort: report zero sizes instead of failing the caller.
		log.G(ctx).Errorf("Failed to compute size of container rootfs %v: %v", containerID, err)
		return sizeRw, sizeRootfs, nil
	}
	defer i.layerStore.ReleaseRWLayer(rwlayer)

	sizeRw, err = rwlayer.Size()
	if err != nil {
		log.G(ctx).Errorf("Driver %s couldn't return diff size of container %s: %s",
			i.layerStore.DriverName(), containerID, err)
		// FIXME: GetSize should return an error. Not changing it now in case
		// there is a side-effect.
		sizeRw = -1
	}

	if parent := rwlayer.Parent(); parent != nil {
		sizeRootfs = parent.Size()
		// Only add the RW size when it is known (-1 means "unknown").
		if sizeRw != -1 {
			sizeRootfs += sizeRw
		}
	}
	return sizeRw, sizeRootfs, nil
}
package images
import (
imagespec "github.com/moby/docker-image-spec/specs-go/v1"
"github.com/moby/moby/api/types/container"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// containerConfigToDockerOCIImageConfig translates a container.Config into
// the Docker OCI image-config extension type. A nil cfg yields an
// all-zero-valued config.
func containerConfigToDockerOCIImageConfig(cfg *container.Config) imagespec.DockerOCIImageConfig {
	var (
		base ocispec.ImageConfig
		ext  imagespec.DockerOCIImageConfigExt
	)
	if cfg != nil {
		base.User = cfg.User
		base.Env = cfg.Env
		base.Entrypoint = cfg.Entrypoint
		base.Cmd = cfg.Cmd
		base.Volumes = cfg.Volumes
		base.WorkingDir = cfg.WorkingDir
		base.Labels = cfg.Labels
		base.StopSignal = cfg.StopSignal
		base.ArgsEscaped = cfg.ArgsEscaped //nolint:staticcheck // Ignore SA1019. Need to keep it in image.
		if len(cfg.ExposedPorts) > 0 {
			// Convert the nat.PortSet keys to plain strings.
			ports := make(map[string]struct{}, len(cfg.ExposedPorts))
			for p, v := range cfg.ExposedPorts {
				ports[string(p)] = v
			}
			base.ExposedPorts = ports
		}
		ext.Healthcheck = cfg.Healthcheck
		ext.OnBuild = cfg.OnBuild
		ext.Shell = cfg.Shell
	}

	return imagespec.DockerOCIImageConfig{
		ImageConfig:             base,
		DockerOCIImageConfigExt: ext,
	}
}
package images
import (
"context"
"fmt"
"os"
"sync/atomic"
"github.com/containerd/containerd/v2/core/content"
"github.com/containerd/containerd/v2/core/leases"
"github.com/containerd/log"
"github.com/docker/docker/daemon/container"
daemonevents "github.com/docker/docker/daemon/events"
"github.com/docker/docker/distribution"
"github.com/docker/docker/distribution/metadata"
"github.com/docker/docker/distribution/xfer"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
refstore "github.com/docker/docker/reference"
"github.com/opencontainers/go-digest"
"github.com/pkg/errors"
)
// containerStore is the subset of the daemon's container store that
// ImageService needs: looking up containers that reference images.
type containerStore interface {
	// First is used by image delete
	First(container.StoreFilter) *container.Container
	// List is used by image prune, and image list
	List() []*container.Container
	// Get is used by CommitBuildStep
	// TODO: remove, only used for CommitBuildStep
	Get(string) *container.Container
}
// ImageServiceConfig is the configuration used to create a new ImageService
type ImageServiceConfig struct {
	ContainerStore            containerStore
	DistributionMetadataStore metadata.Store
	EventsService             *daemonevents.Events
	ImageStore                image.Store
	LayerStore                layer.Store
	// Concurrency and retry limits applied to the layer download/upload
	// managers constructed by NewImageService.
	MaxConcurrentDownloads int
	MaxConcurrentUploads   int
	MaxDownloadAttempts    int
	ReferenceStore         refstore.Store
	RegistryService        distribution.RegistryResolver
	ContentStore           content.Store
	// Leases and ContentNamespace are used to pin pulled content in the
	// containerd content store (see imageStoreWithLease).
	Leases           leases.Manager
	ContentNamespace string
}
// NewImageService returns a new ImageService from a configuration
func NewImageService(config ImageServiceConfig) *ImageService {
	return &ImageService{
		containers:                config.ContainerStore,
		distributionMetadataStore: config.DistributionMetadataStore,
		// The download manager enforces both the concurrency limit and the
		// per-layer retry (attempt) limit.
		downloadManager: xfer.NewLayerDownloadManager(config.LayerStore, config.MaxConcurrentDownloads, xfer.WithMaxDownloadAttempts(config.MaxDownloadAttempts)),
		eventsService:   config.EventsService,
		// Wrap the image store so that deleting an image also deletes the
		// content-store lease registered for it (see imageStoreWithLease).
		imageStore:       &imageStoreWithLease{Store: config.ImageStore, leases: config.Leases, ns: config.ContentNamespace},
		layerStore:       config.LayerStore,
		referenceStore:   config.ReferenceStore,
		registryService:  config.RegistryService,
		uploadManager:    xfer.NewLayerUploadManager(config.MaxConcurrentUploads),
		leases:           config.Leases,
		content:          config.ContentStore,
		contentNamespace: config.ContentNamespace,
	}
}
// ImageService provides a backend for image management
type ImageService struct {
	containers                containerStore
	distributionMetadataStore metadata.Store
	downloadManager           *xfer.LayerDownloadManager
	eventsService             *daemonevents.Events
	imageStore                image.Store
	layerStore                layer.Store
	// pruneRunning guards against concurrent prune operations
	// (see ImagesPrune).
	pruneRunning     atomic.Bool
	referenceStore   refstore.Store
	registryService  distribution.RegistryResolver
	uploadManager    *xfer.LayerUploadManager
	leases           leases.Manager
	content          content.Store
	contentNamespace string
}
// DistributionServices provides daemon image storage services
type DistributionServices struct {
	DownloadManager   *xfer.LayerDownloadManager
	V2MetadataService metadata.V2MetadataService
	LayerStore        layer.Store
	ImageStore        image.Store
	ReferenceStore    refstore.Store
}
// DistributionServices return services controlling daemon image storage
func (i *ImageService) DistributionServices() DistributionServices {
	return DistributionServices{
		DownloadManager:   i.downloadManager,
		V2MetadataService: metadata.NewV2MetadataService(i.distributionMetadataStore),
		LayerStore:        i.layerStore,
		ImageStore:        i.imageStore,
		ReferenceStore:    i.referenceStore,
	}
}
// CountImages returns the number of images stored by ImageService
// called from info.go
func (i *ImageService) CountImages(ctx context.Context) int {
	// ctx is unused; the count is read directly from the in-memory store.
	return i.imageStore.Len()
}
// Children returns the children image.IDs for a parent image.
// called from list.go to filter containers
// TODO: refactor to expose an ancestry for image.ID?
func (i *ImageService) Children(_ context.Context, id image.ID) ([]image.ID, error) {
	// The error return only satisfies the backend interface; this lookup
	// cannot fail.
	return i.imageStore.Children(id), nil
}
// CreateLayer creates a filesystem layer for a container.
// called from create.go
// TODO: accept an opt struct instead of container?
func (i *ImageService) CreateLayer(container *container.Container, initFunc layer.MountInit) (container.RWLayer, error) {
	// Resolve the container's image (if any) so its rootfs chain can be
	// used as the RW layer's parent.
	var img *image.Image
	if container.ImageID != "" {
		var err error
		img, err = i.imageStore.Get(container.ImageID)
		if err != nil {
			return nil, err
		}
	}

	return i.CreateLayerFromImage(img, container.ID, &layer.CreateRWLayerOpts{
		MountLabel: container.MountLabel,
		InitFunc:   initFunc,
		StorageOpt: container.HostConfig.StorageOpt,
	})
}
// CreateLayerFromImage creates a file system from an arbitrary image
// Used to mount an image inside another
func (i *ImageService) CreateLayerFromImage(img *image.Image, layerName string, rwLayerOpts *layer.CreateRWLayerOpts) (container.RWLayer, error) {
	// A nil image results in the zero ChainID, i.e. an RW layer with no
	// read-only parent.
	var parentChainID layer.ChainID
	if img != nil {
		parentChainID = img.RootFS.ChainID()
	}
	return i.layerStore.CreateRWLayer(layerName, parentChainID, rwLayerOpts)
}
// GetLayerByID returns a layer by ID
// called from daemon.go Daemon.restore().
func (i *ImageService) GetLayerByID(cid string) (container.RWLayer, error) {
	// cid is the container ID; RW layers are keyed by it.
	return i.layerStore.GetRWLayer(cid)
}
// LayerStoreStatus returns the status for each layer store
// called from info.go
//
// Each entry is a (key, value) pair reported by the graph driver.
func (i *ImageService) LayerStoreStatus() [][2]string {
	return i.layerStore.DriverStatus()
}
// GetLayerMountID returns the mount ID for a layer
// called from daemon.go Daemon.Shutdown(), and Daemon.Cleanup() (cleanup is actually containerCleanup)
// TODO: needs to be refactored to Unmount (see callers), or removed and replaced with GetLayerByID
func (i *ImageService) GetLayerMountID(cid string) (string, error) {
	return i.layerStore.GetMountID(cid)
}
// Cleanup resources before the process is shutdown.
// called from daemon.go Daemon.Shutdown()
func (i *ImageService) Cleanup() error {
	err := i.layerStore.Cleanup()
	if err != nil {
		return errors.Wrap(err, "error during layerStore.Cleanup()")
	}
	return nil
}
// StorageDriver returns the name of the storage driver used by the ImageService.
func (i *ImageService) StorageDriver() string {
	return i.layerStore.DriverName()
}
// ReleaseLayer releases a layer allowing it to be removed
// called from delete.go Daemon.cleanupContainer().
//
// "Mount does not exist" and "file not found" errors from the driver are
// treated as already-released and ignored.
func (i *ImageService) ReleaseLayer(rwlayer container.RWLayer) error {
	rwl, ok := rwlayer.(layer.RWLayer)
	if !ok {
		return fmt.Errorf("unexpected RWLayer type: %T", rwlayer)
	}

	metaData, err := i.layerStore.ReleaseRWLayer(rwl)
	for _, m := range metaData {
		log.G(context.TODO()).WithField("chainID", m.ChainID).Infof("release RWLayer: cleaned up layer %s", m.ChainID)
	}

	switch {
	case err == nil,
		errors.Is(err, layer.ErrMountDoesNotExist),
		errors.Is(err, os.ErrNotExist):
		return nil
	default:
		return errors.Wrapf(err, "driver %q failed to remove root filesystem",
			i.layerStore.DriverName())
	}
}
// ImageDiskUsage returns the number of bytes used by content and layer stores
// called from disk_usage.go
//
// Only layers referenced by at least one non-intermediate image are counted.
func (i *ImageService) ImageDiskUsage(ctx context.Context) (int64, error) {
	var total int64
	refs := i.getLayerRefs()
	for _, l := range i.layerStore.Map() {
		// Bail out (returning the partial total) if the caller gave up.
		if err := ctx.Err(); err != nil {
			return total, err
		}
		size := l.DiffSize()
		if _, referenced := refs[l.ChainID()]; referenced {
			total += size
		}
	}
	return total, nil
}
// getLayerRefs counts, for every chain ID, how many stored images reference
// that layer. Unreferenced images that still have children (build
// intermediates) are excluded from the count.
func (i *ImageService) getLayerRefs() map[layer.ChainID]int {
	refCounts := map[layer.ChainID]int{}
	for imgID, img := range i.imageStore.Map() {
		if len(i.referenceStore.References(digest.Digest(imgID))) == 0 && len(i.imageStore.Children(imgID)) != 0 {
			continue
		}
		// Rebuild the chain one diff ID at a time so every prefix chain ID
		// of the image's rootfs gets counted.
		chain := *img.RootFS
		chain.DiffIDs = nil
		for _, diffID := range img.RootFS.DiffIDs {
			chain.Append(diffID)
			refCounts[chain.ChainID()]++
		}
	}
	return refCounts
}
// UpdateConfig values
//
// called from reload.go
//
// A value of 0 leaves the corresponding concurrency limit unchanged.
func (i *ImageService) UpdateConfig(maxDownloads, maxUploads int) {
	if maxDownloads != 0 && i.downloadManager != nil {
		i.downloadManager.SetConcurrency(maxDownloads)
	}
	if maxUploads != 0 && i.uploadManager != nil {
		i.uploadManager.SetConcurrency(maxUploads)
	}
}
package images
import (
"context"
"sync"
"github.com/containerd/containerd/v2/core/content"
"github.com/containerd/containerd/v2/core/leases"
"github.com/containerd/containerd/v2/pkg/namespaces"
cerrdefs "github.com/containerd/errdefs"
"github.com/containerd/log"
"github.com/docker/docker/distribution"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
"github.com/opencontainers/go-digest"
"github.com/pkg/errors"
)
// imageKeyPrefix namespaces the lease IDs used to pin image content.
const imageKeyPrefix = "moby-image-"

// imageKey returns the lease ID used for the image with the given digest
// string: the moby image prefix followed by the digest.
func imageKey(dgst string) string {
	key := imageKeyPrefix
	key += dgst
	return key
}
// imageStoreWithLease wraps the configured image store with one that deletes the lease
// registered for a given image ID, if one exists
//
// This is used by the main image service to wrap delete calls to the real image store.
type imageStoreWithLease struct {
	image.Store
	leases leases.Manager

	// Normally we'd pass namespace down through a context.Context, however...
	// The interface for image store doesn't allow this, so we store it here.
	ns string
}
// Delete removes the image from the underlying store after deleting the
// content lease registered for it (a missing lease is not an error).
func (s *imageStoreWithLease) Delete(id image.ID) ([]layer.Metadata, error) {
	ctx := namespaces.WithNamespace(context.TODO(), s.ns)

	err := s.leases.Delete(ctx, leases.Lease{ID: imageKey(id.String())})
	if err != nil && !cerrdefs.IsNotFound(err) {
		return nil, errors.Wrap(err, "error deleting lease")
	}
	return s.Store.Delete(id)
}
// imageStoreForPull is created for each pull. It wraps an underlying image store
// to handle registering leases for content fetched in a single image pull.
type imageStoreForPull struct {
	distribution.ImageConfigStore
	leases leases.Manager
	// ingested tracks the content digests written during this pull so they
	// can be attached to the image's lease (see updateLease).
	ingested *contentStoreForPull
}
// Put stores the image config and then attaches this pull's ingested
// content to the lease keyed by the resulting digest.
func (s *imageStoreForPull) Put(ctx context.Context, config []byte) (digest.Digest, error) {
	dgst, err := s.ImageConfigStore.Put(ctx, config)
	if err != nil {
		return "", err
	}
	if err := s.updateLease(ctx, dgst); err != nil {
		return dgst, err
	}
	return dgst, nil
}
// Get reads the image config for dgst and refreshes the image's lease with
// this pull's ingested content.
func (s *imageStoreForPull) Get(ctx context.Context, dgst digest.Digest) ([]byte, error) {
	config, err := s.ImageConfigStore.Get(ctx, dgst)
	if err != nil {
		return nil, err
	}
	return config, s.updateLease(ctx, dgst)
}
// updateLease ensures a lease named after the image digest exists and adds
// every content digest ingested during this pull as a resource of that
// lease, pinning the image's content in the content store.
func (s *imageStoreForPull) updateLease(ctx context.Context, dgst digest.Digest) error {
	leaseID := imageKey(dgst.String())
	lease, err := s.leases.Create(ctx, leases.WithID(leaseID))
	if err != nil {
		if !cerrdefs.IsAlreadyExists(err) {
			return errors.Wrap(err, "error creating lease")
		}
		// The lease already exists (e.g. re-pull); reuse it by ID.
		lease = leases.Lease{ID: leaseID}
	}

	digested := s.ingested.getDigested()
	resource := leases.Resource{
		Type: "content",
	}
	for _, dgst := range digested {
		log.G(ctx).WithFields(log.Fields{
			"digest": dgst,
			"lease":  lease.ID,
		}).Debug("Adding content digest to lease")

		resource.ID = dgst.String()
		if err := s.leases.AddResource(ctx, lease, resource); err != nil {
			return errors.Wrapf(err, "error adding content digest to lease: %s", dgst)
		}
	}
	return nil
}
// contentStoreForPull is used to wrap the configured content store to
// add lease management for a single `pull`
// It stores all committed digests so that `imageStoreForPull` can add
// the digested resources to the lease for an image.
type contentStoreForPull struct {
	distribution.ContentStore
	leases leases.Manager

	// mu guards digested, which collects every digest committed (or found
	// to already exist) during the pull.
	mu       sync.Mutex
	digested []digest.Digest
}
// addDigested records a committed content digest; safe for concurrent use.
func (c *contentStoreForPull) addDigested(dgst digest.Digest) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.digested = append(c.digested, dgst)
}
// getDigested returns a snapshot copy of the digests recorded so far; safe
// for concurrent use.
func (c *contentStoreForPull) getDigested() []digest.Digest {
	c.mu.Lock()
	defer c.mu.Unlock()
	return append([]digest.Digest(nil), c.digested...)
}
// Writer wraps the underlying content store's Writer. When the content
// already exists (AlreadyExists), the descriptor's digest is still recorded
// so the existing blob gets attached to the pull's lease; the error is
// propagated to the caller either way.
func (c *contentStoreForPull) Writer(ctx context.Context, opts ...content.WriterOpt) (content.Writer, error) {
	w, err := c.ContentStore.Writer(ctx, opts...)
	if err != nil {
		if cerrdefs.IsAlreadyExists(err) {
			// Apply the writer options to recover the descriptor (and its
			// digest) the caller was trying to write.
			var cfg content.WriterOpts
			for _, o := range opts {
				if err := o(&cfg); err != nil {
					return nil, err
				}
			}

			c.addDigested(cfg.Desc.Digest)
		}
		return nil, err
	}
	return &contentWriter{
		cs:     c,
		Writer: w,
	}, nil
}
// contentWriter wraps a content.Writer so that successfully committed
// digests are reported back to the owning contentStoreForPull.
type contentWriter struct {
	cs *contentStoreForPull
	content.Writer
}
// Commit commits the written blob and records its digest; a blob that
// already exists still counts as successfully ingested. The underlying
// error (including already-exists) is returned unchanged.
func (w *contentWriter) Commit(ctx context.Context, size int64, expected digest.Digest, opts ...content.Opt) error {
	err := w.Writer.Commit(ctx, size, expected, opts...)
	if err != nil && !cerrdefs.IsAlreadyExists(err) {
		return err
	}
	w.cs.addDigested(expected)
	return err
}
//go:build linux || freebsd
package initlayer
import (
"os"
"path/filepath"
"strings"
"github.com/moby/sys/user"
"golang.org/x/sys/unix"
)
// Setup populates a directory with mountpoints suitable
// for bind-mounting things into the container.
//
// This extra layer is used by all containers as the top-most ro layer. It protects
// the container from unwanted side-effects on the rw layer.
//
// Each created entry is chowned to uid/gid. Any error (including a failed
// chown, which was previously ignored) aborts setup and is returned.
func Setup(initLayerFs string, uid int, gid int) error {
	// Since all paths are local to the container, we can just extract initLayerFs.Path()
	initLayer := initLayerFs

	for pth, typ := range map[string]string{
		"/dev/pts":         "dir",
		"/dev/shm":         "dir",
		"/proc":            "dir",
		"/sys":             "dir",
		"/.dockerenv":      "file",
		"/etc/resolv.conf": "file",
		"/etc/hosts":       "file",
		"/etc/hostname":    "file",
		"/dev/console":     "file",
		"/etc/mtab":        "/proc/mounts",
	} {
		// Remove any pre-existing path components (e.g. stale symlinks) so
		// the entries created below are fresh; unlink errors are ignored
		// because the paths usually do not exist yet.
		parts := strings.Split(pth, "/")
		prev := "/"
		for _, p := range parts[1:] {
			prev = filepath.Join(prev, p)
			_ = unix.Unlink(filepath.Join(initLayer, prev))
		}

		if _, err := os.Stat(filepath.Join(initLayer, pth)); err != nil {
			if !os.IsNotExist(err) {
				return err
			}
			if err := user.MkdirAllAndChown(filepath.Join(initLayer, filepath.Dir(pth)), 0o755, uid, gid, user.WithOnlyNew); err != nil {
				return err
			}
			switch typ {
			case "dir":
				if err := user.MkdirAllAndChown(filepath.Join(initLayer, pth), 0o755, uid, gid, user.WithOnlyNew); err != nil {
					return err
				}
			case "file":
				f, err := os.OpenFile(filepath.Join(initLayer, pth), os.O_CREATE, 0o755)
				if err != nil {
					return err
				}
				// The Chown error used to be silently dropped; a failure
				// here leaves the file owned by the wrong user, so
				// propagate it.
				if err := f.Chown(uid, gid); err != nil {
					_ = f.Close()
					return err
				}
				if err := f.Close(); err != nil {
					return err
				}
			default:
				// typ is the symlink target, e.g. /etc/mtab -> /proc/mounts.
				if err := os.Symlink(typ, filepath.Join(initLayer, pth)); err != nil {
					return err
				}
			}
		}
	}

	// Layer is ready to use, if it wasn't before.
	return nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package containerimage
import (
"context"
"encoding/json"
"fmt"
"io"
"path"
"strconv"
"strings"
"sync"
"time"
"github.com/containerd/containerd/v2/core/content"
c8dimages "github.com/containerd/containerd/v2/core/images"
"github.com/containerd/containerd/v2/core/leases"
"github.com/containerd/containerd/v2/core/remotes"
"github.com/containerd/containerd/v2/core/remotes/docker"
"github.com/containerd/containerd/v2/pkg/gc"
c8dreference "github.com/containerd/containerd/v2/pkg/reference"
cerrdefs "github.com/containerd/errdefs"
"github.com/containerd/log"
"github.com/containerd/platforms"
"github.com/distribution/reference"
dimages "github.com/docker/docker/daemon/images"
"github.com/docker/docker/distribution/metadata"
"github.com/docker/docker/distribution/xfer"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
pkgprogress "github.com/docker/docker/pkg/progress"
refstore "github.com/docker/docker/reference"
"github.com/moby/buildkit/cache"
"github.com/moby/buildkit/client"
"github.com/moby/buildkit/client/llb/sourceresolver"
"github.com/moby/buildkit/session"
"github.com/moby/buildkit/solver"
"github.com/moby/buildkit/solver/pb"
"github.com/moby/buildkit/source"
"github.com/moby/buildkit/source/containerimage"
srctypes "github.com/moby/buildkit/source/types"
"github.com/moby/buildkit/sourcepolicy"
spb "github.com/moby/buildkit/sourcepolicy/pb"
"github.com/moby/buildkit/util/flightcontrol"
"github.com/moby/buildkit/util/imageutil"
"github.com/moby/buildkit/util/leaseutil"
"github.com/moby/buildkit/util/progress"
"github.com/moby/buildkit/util/resolver"
"github.com/opencontainers/go-digest"
"github.com/opencontainers/image-spec/identity"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
"golang.org/x/time/rate"
)
// SourceOpt is options for creating the image source
type SourceOpt struct {
	ContentStore content.Store
	CacheAccessor cache.Accessor
	ReferenceStore refstore.Store
	DownloadManager *xfer.LayerDownloadManager
	MetadataStore metadata.V2MetadataService
	ImageStore image.Store
	RegistryHosts docker.RegistryHosts
	LayerStore layer.Store
	LeaseManager leases.Manager
	// GarbageCollect, when set, is triggered after a pull's temporary
	// lease is released (see puller.Snapshot) to clean up unreferenced
	// content.
	GarbageCollect func(ctx context.Context) (gc.Stats, error)
}
// Source is the source implementation for accessing container images
type Source struct {
	SourceOpt
	// g deduplicates concurrent remote config resolutions for the same
	// ref/platform key (multi-stage builds resolve in parallel).
	g flightcontrol.Group[*resolveRemoteResult]
}
// NewSource creates a new image source
func NewSource(opt SourceOpt) (*Source, error) {
	src := &Source{SourceOpt: opt}
	return src, nil
}
// Schemes returns a list of SourceOp identifier schemes that this source
// should match.
func (is *Source) Schemes() []string {
	schemes := []string{srctypes.DockerImageScheme}
	return schemes
}
// Identifier constructs an Identifier from the given scheme, ref, and attrs,
// all of which come from a SourceOp.
func (is *Source) Identifier(scheme, ref string, attrs map[string]string, platform *pb.Platform) (source.Identifier, error) {
	// Only the docker-image scheme is supported (see Schemes), so this
	// always delegates to registryIdentifier; scheme itself is unused.
	return is.registryIdentifier(ref, attrs, platform)
}
// Copied from github.com/moby/buildkit/source/containerimage/source.go
//
// registryIdentifier builds an ImageIdentifier from ref plus the attribute
// map carried by a SourceOp, applying platform and per-attr overrides.
func (is *Source) registryIdentifier(ref string, attrs map[string]string, platform *pb.Platform) (source.Identifier, error) {
	id, err := containerimage.NewImageIdentifier(ref)
	if err != nil {
		return nil, err
	}

	if platform != nil {
		p := &ocispec.Platform{
			OS:           platform.OS,
			Architecture: platform.Architecture,
			Variant:      platform.Variant,
			OSVersion:    platform.OSVersion,
		}
		if platform.OSFeatures != nil {
			p.OSFeatures = append([]string{}, platform.OSFeatures...)
		}
		id.Platform = p
	}

	for k, v := range attrs {
		switch k {
		case pb.AttrImageResolveMode:
			mode, err := resolver.ParseImageResolveMode(v)
			if err != nil {
				return nil, err
			}
			id.ResolveMode = mode
		case pb.AttrImageRecordType:
			recType, err := parseImageRecordType(v)
			if err != nil {
				return nil, err
			}
			id.RecordType = recType
		case pb.AttrImageLayerLimit:
			limit, err := strconv.Atoi(v)
			if err != nil {
				return nil, errors.Wrapf(err, "invalid layer limit %s", v)
			}
			if limit <= 0 {
				return nil, errors.Errorf("invalid layer limit %s", v)
			}
			id.LayerLimit = &limit
		}
	}

	return id, nil
}
// parseImageRecordType validates v as a client.UsageRecordType; the empty
// string maps to the regular record type.
func parseImageRecordType(v string) (client.UsageRecordType, error) {
	rt := client.UsageRecordType(v)
	switch rt {
	case "":
		return client.UsageRecordTypeRegular, nil
	case client.UsageRecordTypeRegular, client.UsageRecordTypeInternal, client.UsageRecordTypeFrontend:
		return rt, nil
	}
	return "", errors.Errorf("invalid record type %s", v)
}
// resolveLocal looks refStr up in the daemon's reference and image stores,
// returning the locally stored image if present.
func (is *Source) resolveLocal(refStr string) (*image.Image, error) {
	named, err := reference.ParseNormalizedNamed(refStr)
	if err != nil {
		return nil, err
	}
	dgst, err := is.ReferenceStore.Get(named)
	if err != nil {
		return nil, err
	}
	return is.ImageStore.Get(image.ID(dgst))
}
// resolveRemoteResult is the flightcontrol result of one remote config
// resolution: the resolved ref, manifest digest, and raw config bytes.
type resolveRemoteResult struct {
	ref string
	dgst digest.Digest
	dt []byte
}
// resolveRemote fetches the image config digest and bytes from the
// registry, deduplicating concurrent resolutions of the same ref/platform
// through the flightcontrol group.
func (is *Source) resolveRemote(ctx context.Context, ref string, platform *ocispec.Platform, sm *session.Manager, g session.Group) (digest.Digest, []byte, error) {
	p := platforms.DefaultSpec()
	if platform != nil {
		p = *platform
	}
	// key is used to synchronize resolutions that can happen in parallel when doing multi-stage.
	key := "getconfig::" + ref + "::" + platforms.FormatAll(p)
	result, err := is.g.Do(ctx, key, func(ctx context.Context) (*resolveRemoteResult, error) {
		rslvr := resolver.DefaultPool.GetResolver(is.RegistryHosts, ref, "pull", sm, g)
		dgst, dt, err := imageutil.Config(ctx, ref, rslvr, is.ContentStore, is.LeaseManager, platform)
		if err != nil {
			return nil, err
		}
		return &resolveRemoteResult{ref: ref, dgst: dgst, dt: dt}, nil
	})
	if err != nil {
		return "", nil, err
	}
	return result.dgst, result.dt, nil
}
// ResolveImageConfig returns image config for an image
func (is *Source) ResolveImageConfig(ctx context.Context, ref string, opt sourceresolver.Opt, sm *session.Manager, g session.Group) (digest.Digest, []byte, error) {
	if opt.ImageOpt == nil {
		return "", nil, fmt.Errorf("can only resolve an image: %v, opt: %v", ref, opt)
	}

	ref, err := applySourcePolicies(ctx, ref, opt.SourcePolicies)
	if err != nil {
		return "", nil, err
	}
	resolveMode, err := resolver.ParseImageResolveMode(opt.ImageOpt.ResolveMode)
	if err != nil {
		return "", nil, err
	}

	switch resolveMode {
	case resolver.ResolveModeForcePull:
		return is.resolveRemote(ctx, ref, opt.Platform, sm, g)
		// TODO: pull should fallback to local in case of failure to allow offline behavior
		// the fallback doesn't work currently
		/*
			if err == nil {
				return dgst, dt, err
			}
			// fallback to local
			dt, err = is.resolveLocal(ref)
			return "", dt, err
		*/

	case resolver.ResolveModeDefault, resolver.ResolveModePreferLocal:
		// default == prefer local, but in the future could be smarter
		img, err := is.resolveLocal(ref)
		if err == nil {
			if opt.Platform == nil || platformMatches(img, opt.Platform) {
				return "", img.RawJSON(), nil
			}
			log.G(ctx).WithField("ref", ref).Debugf("Requested build platform %s does not match local image platform %s, checking remote",
				path.Join(opt.Platform.OS, opt.Platform.Architecture, opt.Platform.Variant),
				path.Join(img.OS, img.Architecture, img.Variant),
			)
		}
		// fallback to remote
		return is.resolveRemote(ctx, ref, opt.Platform, sm, g)
	}
	// should never happen
	return "", nil, fmt.Errorf("builder cannot resolve image %s: invalid mode %q", ref, opt.ImageOpt.ResolveMode)
}
// Resolve returns access to pulling for an identifier
func (is *Source) Resolve(ctx context.Context, id source.Identifier, sm *session.Manager, vtx solver.Vertex) (source.SourceInstance, error) {
	imageIdentifier, ok := id.(*containerimage.ImageIdentifier)
	if !ok {
		return nil, errors.Errorf("invalid image identifier %v", id)
	}

	platform := platforms.DefaultSpec()
	if ip := imageIdentifier.Platform; ip != nil {
		platform = *ip
	}

	return &puller{
		src:      imageIdentifier,
		is:       is,
		platform: platform,
		sm:       sm,
	}, nil
}
// puller implements a source instance for a single image pull. The
// desc/ref/config fields are populated lazily by resolveLocal/resolve.
type puller struct {
	is *Source
	// resolveLocalOnce guards the one-time local store lookup.
	resolveLocalOnce sync.Once
	// g collapses concurrent resolve() calls into one.
	g flightcontrol.Group[struct{}]
	src *containerimage.ImageIdentifier
	// desc is the resolved manifest descriptor (may stay empty if only a
	// local config was found).
	desc ocispec.Descriptor
	ref string
	// config is the raw image config JSON, from the local image store or
	// the registry.
	config []byte
	platform ocispec.Platform
	sm *session.Manager
}
// resolver returns a pull-scoped registry resolver for the puller's reference.
func (p *puller) resolver(g session.Group) remotes.Resolver {
	return resolver.DefaultPool.GetResolver(p.is.RegistryHosts, p.src.Reference.String(), "pull", p.sm, g)
}
// mainManifestKey derives a stable digest-based cache key from the
// manifest digest plus the requested platform (OS/arch/variant).
func (p *puller) mainManifestKey(platform ocispec.Platform) (digest.Digest, error) {
	key := struct {
		Digest  digest.Digest
		OS      string
		Arch    string
		Variant string `json:",omitempty"`
	}{
		Digest:  p.desc.Digest,
		OS:      platform.OS,
		Arch:    platform.Architecture,
		Variant: platform.Variant,
	}
	dt, err := json.Marshal(key)
	if err != nil {
		return "", err
	}
	return digest.FromBytes(dt), nil
}
// resolveLocal attempts, exactly once, to populate p.desc (from a digested
// reference whose manifest is already in the content store) and p.config
// (from the daemon's local image store) without touching the network.
// All failures are silently ignored; resolve() handles the remote path.
func (p *puller) resolveLocal() {
	p.resolveLocalOnce.Do(func() {
		dgst := p.src.Reference.Digest()
		if dgst != "" {
			// The ref pins a digest: if the manifest blob is present
			// locally, build the descriptor without a registry round-trip.
			info, err := p.is.ContentStore.Info(context.TODO(), dgst)
			if err == nil {
				p.ref = p.src.Reference.String()
				desc := ocispec.Descriptor{
					Size: info.Size,
					Digest: dgst,
				}
				ra, err := p.is.ContentStore.ReaderAt(context.TODO(), desc)
				if err == nil {
					mt, err := imageutil.DetectManifestMediaType(ra)
					if err == nil {
						// Only commit desc once the media type is known.
						desc.MediaType = mt
						p.desc = desc
					}
				}
			}
		}
		if p.src.ResolveMode == resolver.ResolveModeDefault || p.src.ResolveMode == resolver.ResolveModePreferLocal {
			ref := p.src.Reference.String()
			img, err := p.is.resolveLocal(ref)
			if err == nil {
				if !platformMatches(img, &p.platform) {
					log.G(context.TODO()).WithField("ref", ref).Debugf("Requested build platform %s does not match local image platform %s, not resolving",
						path.Join(p.platform.OS, p.platform.Architecture, p.platform.Variant),
						path.Join(img.OS, img.Architecture, img.Variant),
					)
				} else {
					// Use the local image config; resolve() will then skip
					// the remote config fetch.
					p.config = img.RawJSON()
				}
			}
		}
	})
}
// resolve fetches the manifest descriptor and image config from the
// registry for anything resolveLocal could not fill in. Concurrent calls
// are collapsed via the flightcontrol group, and progress is reported via
// oneOffProgress (including the final error through retErr).
func (p *puller) resolve(ctx context.Context, g session.Group) error {
	_, err := p.g.Do(ctx, "", func(ctx context.Context) (_ struct{}, retErr error) {
		resolveProgressDone := oneOffProgress(ctx, "resolve "+p.src.Reference.String())
		defer func() {
			_ = resolveProgressDone(retErr)
		}()
		ref, err := reference.ParseNormalizedNamed(p.src.Reference.String())
		if err != nil {
			return struct{}{}, err
		}
		// Only hit the registry when neither the descriptor nor the
		// config was found locally.
		if p.desc.Digest == "" && p.config == nil {
			origRef, desc, err := p.resolver(g).Resolve(ctx, ref.String())
			if err != nil {
				return struct{}{}, err
			}
			p.desc = desc
			p.ref = origRef
		}
		// Schema 1 manifests cannot be resolved to an image config
		// since the conversion must take place after all the content
		// has been read.
		// It may be possible to have a mapping between schema 1 manifests
		// and the schema 2 manifests they are converted to.
		if p.config == nil && p.desc.MediaType != c8dimages.MediaTypeDockerSchema1Manifest {
			refWithDigest, err := reference.WithDigest(ref, p.desc.Digest)
			if err != nil {
				return struct{}{}, err
			}
			_, dt, err := p.is.ResolveImageConfig(ctx, refWithDigest.String(), sourceresolver.Opt{
				Platform: &p.platform,
				ImageOpt: &sourceresolver.ResolveImageOpt{
					ResolveMode: p.src.ResolveMode.String(),
				},
			}, p.sm, g)
			if err != nil {
				return struct{}{}, err
			}
			p.ref = refWithDigest.String()
			p.config = dt
		}
		return struct{}{}, nil
	})
	return err
}
// CacheKey returns the solver cache key for the pull. At index 0 with a
// known manifest digest, the key is derived from the manifest+platform and
// reported as non-final (last result false); otherwise the key comes from
// the image config's layer chain ID and is final. Resolution falls back
// from local state to a remote resolve.
func (p *puller) CacheKey(ctx context.Context, g session.Group, index int) (string, string, solver.CacheOpts, bool, error) {
	p.resolveLocal()

	// Fast paths using whatever resolveLocal discovered.
	if p.desc.Digest != "" && index == 0 {
		dgst, err := p.mainManifestKey(p.platform)
		if err != nil {
			return "", "", nil, false, err
		}
		return dgst.String(), p.desc.Digest.String(), nil, false, nil
	}
	if p.config != nil {
		k := cacheKeyFromConfig(p.config).String()
		if k == "" {
			// Config is not layer-based; fall back to its raw digest.
			return digest.FromBytes(p.config).String(), digest.FromBytes(p.config).String(), nil, true, nil
		}
		return k, k, nil, true, nil
	}

	// Nothing usable locally: resolve remotely, then apply the same logic.
	if err := p.resolve(ctx, g); err != nil {
		return "", "", nil, false, err
	}
	if p.desc.Digest != "" && index == 0 {
		dgst, err := p.mainManifestKey(p.platform)
		if err != nil {
			return "", "", nil, false, err
		}
		return dgst.String(), p.desc.Digest.String(), nil, false, nil
	}
	if len(p.config) == 0 && p.desc.MediaType != c8dimages.MediaTypeDockerSchema1Manifest {
		return "", "", nil, false, errors.Errorf("invalid empty config file resolved for %s", p.src.Reference.String())
	}
	k := cacheKeyFromConfig(p.config).String()
	if k == "" || p.desc.MediaType == c8dimages.MediaTypeDockerSchema1Manifest {
		// No chain-ID key available; key on the manifest instead.
		dgst, err := p.mainManifestKey(p.platform)
		if err != nil {
			return "", "", nil, false, err
		}
		return dgst.String(), p.desc.Digest.String(), nil, true, nil
	}
	return k, k, nil, true, nil
}
// getRef returns an immutable cache ref for the layer chain in diffIDs,
// recursively obtaining the parent chain first; the local parent reference
// is released after GetByBlob returns (presumably GetByBlob retains its
// own — TODO confirm against cache.Accessor semantics).
//
// NOTE(review): diffIDs must be non-empty; an empty slice would index out
// of range below. Visible callers check for the empty case first.
func (p *puller) getRef(ctx context.Context, diffIDs []layer.DiffID, opts ...cache.RefOption) (cache.ImmutableRef, error) {
	var parent cache.ImmutableRef
	if len(diffIDs) > 1 {
		var err error
		parent, err = p.getRef(ctx, diffIDs[:len(diffIDs)-1], opts...)
		if err != nil {
			return nil, err
		}
		defer parent.Release(context.TODO())
	}
	return p.is.CacheAccessor.GetByBlob(ctx, ocispec.Descriptor{
		Annotations: map[string]string{
			"containerd.io/uncompressed": diffIDs[len(diffIDs)-1].String(),
		},
	}, parent, opts...)
}
// Snapshot materializes the resolved image as an immutable cache ref.
// It first tries the daemon's local image/layer stores; otherwise it pulls
// manifest+config via containerd handlers under a temporary 5-minute lease,
// downloads the layers through the download manager, and registers the
// resulting rootfs with the cache accessor. A (nil, nil) return means the
// image has no layers.
func (p *puller) Snapshot(ctx context.Context, g session.Group) (cache.ImmutableRef, error) {
	p.resolveLocal()
	if len(p.config) == 0 {
		if err := p.resolve(ctx, g); err != nil {
			return nil, err
		}
	}

	// Fast path: the image (and its full layer chain) already exists locally.
	if p.config != nil {
		img, err := p.is.ImageStore.Get(image.ID(digest.FromBytes(p.config)))
		if err == nil {
			if len(img.RootFS.DiffIDs) == 0 {
				return nil, nil
			}
			l, err := p.is.LayerStore.Get(img.RootFS.ChainID())
			if err == nil {
				// The Get above was only an existence probe; release it.
				layer.ReleaseAndLog(p.is.LayerStore, l)
				ref, err := p.getRef(ctx, img.RootFS.DiffIDs, cache.WithDescription(fmt.Sprintf("from local %s", p.ref)))
				if err != nil {
					return nil, err
				}
				return ref, nil
			}
		}
	}

	ongoing := newJobs(p.ref)

	// Content fetched during the pull is held by a short-lived lease;
	// once done() releases it, an optional GC pass cleans up leftovers.
	ctx, done, err := leaseutil.WithLease(ctx, p.is.LeaseManager, leases.WithExpiration(5*time.Minute), leaseutil.MakeTemporary)
	if err != nil {
		return nil, err
	}
	defer func() {
		done(context.TODO())
		if p.is.GarbageCollect != nil {
			go p.is.GarbageCollect(context.TODO())
		}
	}()

	pctx, stopProgress := context.WithCancel(ctx)

	pw, _, ctx := progress.NewFromContext(ctx)
	defer pw.Close()

	progressDone := make(chan struct{})
	go func() {
		showProgress(pctx, ongoing, p.is.ContentStore, pw)
		close(progressDone)
	}()
	defer func() {
		<-progressDone
	}()

	fetcher, err := p.resolver(g).Fetcher(ctx, p.ref)
	if err != nil {
		stopProgress()
		return nil, err
	}

	platform := platforms.Only(p.platform)

	// nonLayers collects manifest/index/config digests so they can be
	// attached to the resulting ref's lease at the end.
	var nonLayers []digest.Digest

	var handlers []c8dimages.Handler
	if p.desc.MediaType == c8dimages.MediaTypeDockerSchema1Manifest {
		stopProgress()
		// similar to [github.com/docker/docker/distribution/DeprecatedSchema1ImageError]
		errMsg := "support for Docker Image Format v1 and Docker Image manifest version 2, schema 1 has been removed in Docker Engine v28.2. " +
			"More information at https://docs.docker.com/go/deprecated-image-specs/"
		return nil, cerrdefs.ErrInvalidArgument.WithMessage(errMsg)
		// TODO: Optimize to do dispatch and integrate pulling with download manager,
		// leverage existing blob mapping and layer storage
	} else {
		// TODO: need a wrapper snapshot interface that combines content
		// and snapshots as 1) buildkit shouldn't have a dependency on contentstore
		// or 2) cachemanager should manage the contentstore
		handlers = append(handlers, c8dimages.HandlerFunc(func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
			switch desc.MediaType {
			case c8dimages.MediaTypeDockerSchema2Manifest, ocispec.MediaTypeImageManifest,
				c8dimages.MediaTypeDockerSchema2ManifestList, ocispec.MediaTypeImageIndex,
				c8dimages.MediaTypeDockerSchema2Config, ocispec.MediaTypeImageConfig:
				nonLayers = append(nonLayers, desc.Digest)
			default:
				// Layer blobs are skipped here; they are downloaded via
				// the download manager below.
				return nil, c8dimages.ErrSkipDesc
			}
			ongoing.add(desc)
			return nil, nil
		}))

		// Get all the children for a descriptor
		childrenHandler := c8dimages.ChildrenHandler(p.is.ContentStore)
		// Filter the children by the platform
		childrenHandler = c8dimages.FilterPlatforms(childrenHandler, platform)
		// Limit manifests pulled to the best match in an index
		childrenHandler = c8dimages.LimitManifests(childrenHandler, platform, 1)

		handlers = append(handlers,
			remotes.FetchHandler(p.is.ContentStore, fetcher),
			childrenHandler,
		)
	}

	if err := c8dimages.Dispatch(ctx, c8dimages.Handlers(handlers...), nil, p.desc); err != nil {
		stopProgress()
		return nil, err
	}
	defer stopProgress()

	mfst, err := c8dimages.Manifest(ctx, p.is.ContentStore, p.desc, platform)
	if err != nil {
		return nil, err
	}

	config, err := c8dimages.Config(ctx, p.is.ContentStore, p.desc, platform)
	if err != nil {
		return nil, err
	}

	dt, err := content.ReadBlob(ctx, p.is.ContentStore, config)
	if err != nil {
		return nil, err
	}

	var img ocispec.Image
	if err := json.Unmarshal(dt, &img); err != nil {
		return nil, err
	}

	if len(mfst.Layers) != len(img.RootFS.DiffIDs) {
		return nil, errors.Errorf("invalid config for manifest")
	}

	// Bridge the download manager's progress channel into the buildkit
	// progress writer, rate-limiting "extract" updates to 10/s per layer.
	pchan := make(chan pkgprogress.Progress, 10)
	defer close(pchan)

	go func() {
		m := map[string]struct {
			st time.Time
			limiter *rate.Limiter
		}{}
		for p := range pchan {
			if p.Action == "Extracting" {
				st, ok := m[p.ID]
				if !ok {
					st.st = time.Now()
					st.limiter = rate.NewLimiter(rate.Every(100*time.Millisecond), 1)
					m[p.ID] = st
				}
				var end *time.Time
				if p.LastUpdate || st.limiter.Allow() {
					if p.LastUpdate {
						tm := time.Now()
						end = &tm
					}
					_ = pw.Write("extracting "+p.ID, progress.Status{
						Action: "extract",
						Started: &st.st,
						Completed: end,
					})
				}
			}
		}
	}()

	if len(mfst.Layers) == 0 {
		return nil, nil
	}

	layers := make([]xfer.DownloadDescriptor, 0, len(mfst.Layers))
	for i, desc := range mfst.Layers {
		if err := desc.Digest.Validate(); err != nil {
			return nil, errors.Wrap(err, "layer digest could not be validated")
		}
		ongoing.add(desc)
		layers = append(layers, &layerDescriptor{
			desc: desc,
			diffID: img.RootFS.DiffIDs[i],
			fetcher: fetcher,
			ref: p.src.Reference,
			is: p.is,
		})
	}

	defer func() {
		<-progressDone
	}()

	rootFS, release, err := p.is.DownloadManager.Download(ctx, layers, pkgprogress.ChanOutput(pchan))
	stopProgress()
	if err != nil {
		return nil, err
	}

	ref, err := p.getRef(ctx, rootFS.DiffIDs, cache.WithDescription(fmt.Sprintf("pulled from %s", p.ref)))
	release()
	if err != nil {
		return nil, err
	}

	// keep manifest blobs until ref is alive for cache
	for _, nl := range nonLayers {
		if err := p.is.LeaseManager.AddResource(ctx, leases.Lease{ID: ref.ID()}, leases.Resource{
			ID: nl.String(),
			Type: "content",
		}); err != nil {
			return nil, err
		}
	}

	// TODO: handle windows layers for cross platform builds

	if p.src.RecordType != "" && ref.GetRecordType() == "" {
		if err := ref.SetRecordType(p.src.RecordType); err != nil {
			ref.Release(context.TODO())
			return nil, err
		}
	}

	return ref, nil
}
// layerDescriptor describes a single layer blob for the layer download
// manager: where to fetch it from (fetcher/desc), its expected
// uncompressed digest (diffID), and the repository it came from (ref).
type layerDescriptor struct {
	is *Source
	fetcher remotes.Fetcher
	desc ocispec.Descriptor
	diffID layer.DiffID
	ref c8dreference.Spec
}
// Key returns the download manager key for this layer blob.
func (ld *layerDescriptor) Key() string {
	k := "v2:" + ld.desc.Digest.String()
	return k
}
// ID returns the layer's digest string, used as a progress identifier.
func (ld *layerDescriptor) ID() string {
	d := ld.desc.Digest
	return d.String()
}
// DiffID returns the expected uncompressed digest of the layer.
func (ld *layerDescriptor) DiffID() (layer.DiffID, error) {
	diffID := ld.diffID
	return diffID, nil
}
// Download fetches the layer blob into the content store and returns a
// reader over the stored bytes plus the descriptor's size.
func (ld *layerDescriptor) Download(ctx context.Context, progressOutput pkgprogress.Output) (io.ReadCloser, int64, error) {
	rc, err := ld.fetcher.Fetch(ctx, ld.desc)
	if err != nil {
		return nil, 0, err
	}
	defer rc.Close()

	refKey := remotes.MakeRefKey(ctx, ld.desc)

	// Drop any partial ingest left over from a previous attempt (the
	// Abort error is intentionally ignored — there may be none).
	ld.is.ContentStore.Abort(ctx, refKey)

	if err := content.WriteBlob(ctx, ld.is.ContentStore, refKey, rc, ld.desc); err != nil {
		ld.is.ContentStore.Abort(ctx, refKey)
		return nil, 0, err
	}

	// Serve the layer back out of the content store rather than the network.
	ra, err := ld.is.ContentStore.ReaderAt(ctx, ld.desc)
	if err != nil {
		return nil, 0, err
	}
	return io.NopCloser(content.NewReader(ra)), ld.desc.Size, nil
}
// Close is intentionally a no-op: the fetched blob stays in the content
// store (see the deliberately commented-out Delete below).
func (ld *layerDescriptor) Close() {
	// ld.is.ContentStore.Delete(context.TODO(), ld.desc.Digest))
}
// Registered is called once the layer is registered in the layer store;
// it records the diffID -> blob digest mapping. NOTE(review): the Add
// error is dropped — presumably acceptable for a best-effort metadata
// cache, but worth confirming.
func (ld *layerDescriptor) Registered(diffID layer.DiffID) {
	// Cache mapping from this layer's DiffID to the blobsum
	ld.is.MetadataStore.Add(diffID, metadata.V2Metadata{Digest: ld.desc.Digest, SourceRepository: ld.ref.Locator})
}
// showProgress publishes download/done progress for all tracked jobs every
// 100ms until ctx is cancelled, at which point it runs one final flush
// iteration and returns.
func showProgress(ctx context.Context, ongoing *jobs, cs content.Store, pw progress.Writer) {
	var (
		ticker = time.NewTicker(100 * time.Millisecond)
		statuses = map[string]statusInfo{}
		done bool
	)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
		case <-ctx.Done():
			// Context cancelled: do one last pass to emit final statuses.
			done = true
		}

		resolved := "resolved"
		if !ongoing.isResolved() {
			resolved = "resolving"
		}
		statuses[ongoing.name] = statusInfo{
			Ref: ongoing.name,
			Status: resolved,
		}

		actives := make(map[string]statusInfo)

		if !done {
			active, err := cs.ListStatuses(ctx)
			if err != nil {
				// log.G(ctx).WithError(err).Error("active check failed")
				continue
			}
			// update status of active entries!
			for _, active := range active {
				actives[active.Ref] = statusInfo{
					Ref: active.Ref,
					Status: "downloading",
					Offset: active.Offset,
					Total: active.Total,
					StartedAt: active.StartedAt,
					UpdatedAt: active.UpdatedAt,
				}
			}
		}

		// now, update the items in jobs that are not in active
		for _, j := range ongoing.jobs() {
			refKey := remotes.MakeRefKey(ctx, j.Descriptor)
			if a, ok := actives[refKey]; ok {
				// An in-flight ingest exists for this job; report it.
				started := j.started
				_ = pw.Write(j.Digest.String(), progress.Status{
					Action: a.Status,
					Total: int(a.Total),
					Current: int(a.Offset),
					Started: &started,
				})
				continue
			}

			if !j.done {
				// Not in the active set: either already committed to the
				// content store (done) or not started yet (not found).
				info, err := cs.Info(context.TODO(), j.Digest)
				if err != nil {
					if cerrdefs.IsNotFound(err) {
						// _ = pw.Write(j.Digest.String(), progress.Status{
						// 	Action: "waiting",
						// })
						continue
					}
				} else {
					j.done = true
				}

				if done || j.done {
					started := j.started
					createdAt := info.CreatedAt
					_ = pw.Write(j.Digest.String(), progress.Status{
						Action: "done",
						Current: int(info.Size),
						Total: int(info.Size),
						Completed: &createdAt,
						Started: &started,
					})
				}
			}
		}
		if done {
			return
		}
	}
}
// jobs provides a way of identifying the download keys for a particular task
// encountering during the pull walk.
//
// This is very minimal and will probably be replaced with something more
// featured.
type jobs struct {
	name string
	// added maps blob digest to its tracked job; mu guards both added
	// and resolved.
	added map[digest.Digest]*job
	mu sync.Mutex
	resolved bool
}
// job is one tracked descriptor: when its download started and whether it
// has been committed to the content store.
type job struct {
	ocispec.Descriptor
	done bool
	started time.Time
}
// newJobs creates a job tracker for the pull identified by name.
func newJobs(name string) *jobs {
	j := &jobs{added: make(map[digest.Digest]*job)}
	j.name = name
	return j
}
// add registers desc for progress tracking; duplicates are ignored.
func (j *jobs) add(desc ocispec.Descriptor) {
	j.mu.Lock()
	defer j.mu.Unlock()

	if _, seen := j.added[desc.Digest]; seen {
		return
	}
	j.added[desc.Digest] = &job{Descriptor: desc, started: time.Now()}
}
// jobs returns a snapshot slice of all tracked jobs.
func (j *jobs) jobs() []*job {
	j.mu.Lock()
	defer j.mu.Unlock()

	// Note: loop variable renamed so it no longer shadows the receiver.
	out := make([]*job, 0, len(j.added))
	for _, jb := range j.added {
		out = append(out, jb)
	}
	return out
}
// isResolved reports whether the main reference has been marked resolved.
func (j *jobs) isResolved() bool {
	j.mu.Lock()
	resolved := j.resolved
	j.mu.Unlock()
	return resolved
}
// statusInfo is a progress snapshot for a single ref: its state string
// plus byte offsets and timestamps taken from the content store's
// ingest statuses.
type statusInfo struct {
	Ref string
	Status string
	Offset int64
	Total int64
	StartedAt time.Time
	UpdatedAt time.Time
}
// oneOffProgress writes a "started" progress record for id and returns a
// completion func that stamps the end time, flushes, and passes err through.
func oneOffProgress(ctx context.Context, id string) func(err error) error {
	pw, _, _ := progress.NewFromContext(ctx)
	started := time.Now()
	status := progress.Status{Started: &started}
	_ = pw.Write(id, status)

	return func(err error) error {
		// TODO: set error on status
		completed := time.Now()
		status.Completed = &completed
		_ = pw.Write(id, status)
		_ = pw.Close()
		return err
	}
}
// cacheKeyFromConfig returns a stable digest from image config. If image config
// is a known oci image we will use chainID of layers.
//
// An unparseable config falls back to the digest of the raw bytes; a
// parseable config that is not "layers"-based (or has no diff IDs) yields
// "" so callers can key on the manifest instead.
func cacheKeyFromConfig(dt []byte) digest.Digest {
	var img ocispec.Image
	if err := json.Unmarshal(dt, &img); err != nil {
		// WithError already attaches err; the old message additionally
		// formatted it with %v, logging the error twice.
		log.G(context.TODO()).WithError(err).Error("failed to unmarshal image config for cache key")
		return digest.FromBytes(dt)
	}
	if img.RootFS.Type != "layers" || len(img.RootFS.DiffIDs) == 0 {
		return ""
	}
	return identity.ChainID(img.RootFS.DiffIDs)
}
// platformMatches reports whether the local image img satisfies the
// requested platform p, using the daemon's fallback-aware matcher.
func platformMatches(img *image.Image, p *ocispec.Platform) bool {
	imgPlatform := ocispec.Platform{
		Architecture: img.Architecture,
		OS:           img.OS,
		OSVersion:    img.OSVersion,
		OSFeatures:   img.OSFeatures,
		Variant:      img.Variant,
	}
	return dimages.OnlyPlatformWithFallback(*p).Match(imgPlatform)
}
// applySourcePolicies runs the source-policy engine over the docker-image
// ref in str and returns the (possibly mutated) ref string. A mutation to
// a non-image scheme is reported as a ResolveToNonImageError.
func applySourcePolicies(ctx context.Context, str string, spls []*spb.Policy) (string, error) {
	ref, err := c8dreference.Parse(str)
	if err != nil {
		return "", errors.WithStack(err)
	}
	op := &pb.SourceOp{Identifier: srctypes.DockerImageScheme + "://" + ref.String()}

	mutated, err := sourcepolicy.NewEngine(spls).Evaluate(ctx, op)
	if err != nil {
		return "", errors.Wrap(err, "could not resolve image due to policy")
	}
	if !mutated {
		return ref.String(), nil
	}

	scheme, newRef, ok := strings.Cut(op.GetIdentifier(), "://")
	if !ok {
		return "", errors.Errorf("could not parse ref: %s", op.GetIdentifier())
	}
	if scheme != srctypes.DockerImageScheme {
		return "", &imageutil.ResolveToNonImageError{Ref: str, Updated: newRef}
	}
	ref, err = c8dreference.Parse(newRef)
	if err != nil {
		return "", errors.WithStack(err)
	}
	return ref.String(), nil
}
package localinlinecache
import (
"context"
"encoding/json"
"time"
"github.com/containerd/containerd/v2/core/content"
c8dimages "github.com/containerd/containerd/v2/core/images"
"github.com/containerd/containerd/v2/core/remotes/docker"
"github.com/distribution/reference"
imagestore "github.com/docker/docker/image"
refstore "github.com/docker/docker/reference"
"github.com/moby/buildkit/cache/remotecache"
registryremotecache "github.com/moby/buildkit/cache/remotecache/registry"
v1 "github.com/moby/buildkit/cache/remotecache/v1"
"github.com/moby/buildkit/session"
"github.com/moby/buildkit/solver"
"github.com/moby/buildkit/worker"
"github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// ResolveCacheImporterFunc returns a resolver function for local inline cache
func ResolveCacheImporterFunc(sm *session.Manager, resolverFunc docker.RegistryHosts, cs content.Store, rs refstore.Store, is imagestore.Store) remotecache.ResolveCacheImporterFunc {
	upstream := registryremotecache.ResolveCacheImporterFunc(sm, cs, resolverFunc)

	return func(ctx context.Context, group session.Group, attrs map[string]string) (remotecache.Importer, ocispec.Descriptor, error) {
		dt, err := tryImportLocal(rs, is, attrs["ref"])
		if err != nil {
			// Not available locally; defer to the registry importer.
			return upstream(ctx, group, attrs)
		}
		return newLocalImporter(dt), ocispec.Descriptor{}, nil
	}
}
// tryImportLocal resolves refStr through the local reference store and
// returns the raw config of the locally stored image.
func tryImportLocal(rs refstore.Store, is imagestore.Store, refStr string) ([]byte, error) {
	named, err := reference.ParseNormalizedNamed(refStr)
	if err != nil {
		return nil, err
	}
	dgst, err := rs.Get(named)
	if err != nil {
		return nil, err
	}
	img, err := is.Get(imagestore.ID(dgst))
	if err != nil {
		return nil, err
	}
	return img.RawJSON(), nil
}
// newLocalImporter wraps a raw image config in a cache importer.
func newLocalImporter(dt []byte) remotecache.Importer {
	li := &localImporter{dt: dt}
	return li
}
// localImporter imports inline build cache from a locally stored image
// config (dt) instead of fetching it from a registry.
type localImporter struct {
	dt []byte
}
// Resolve parses the embedded inline cache and exposes it as a solver
// CacheManager backed by worker w.
func (li *localImporter) Resolve(ctx context.Context, _ ocispec.Descriptor, id string, w worker.Worker) (solver.CacheManager, error) {
	cc := v1.NewCacheChains()
	if err := li.importInlineCache(ctx, li.dt, cc); err != nil {
		return nil, err
	}

	keysStorage, resultStorage, err := v1.NewCacheKeyStorage(cc, w)
	if err != nil {
		return nil, err
	}
	cm := solver.NewCacheManager(ctx, id, keysStorage, resultStorage)
	return cm, nil
}
// importInlineCache decodes the inline cache manifest embedded in the
// image config dt and feeds its records plus per-layer descriptors into
// cc. Images without an embedded cache are a no-op.
func (li *localImporter) importInlineCache(ctx context.Context, dt []byte, cc solver.CacheExporterTarget) error {
	var img image
	if err := json.Unmarshal(dt, &img); err != nil {
		return err
	}
	if img.Cache == nil {
		return nil
	}
	var config v1.CacheConfig
	if err := json.Unmarshal(img.Cache, &config.Records); err != nil {
		return err
	}

	createdDates, createdMsg, err := parseCreatedLayerInfo(img)
	if err != nil {
		return err
	}

	layers := v1.DescriptorProvider{}
	for i, diffID := range img.Rootfs.DiffIDs {
		dgst := digest.Digest(diffID.String())
		desc := ocispec.Descriptor{
			Digest:      dgst,
			Size:        -1,
			MediaType:   c8dimages.MediaTypeDockerSchema2Layer,
			Annotations: map[string]string{},
		}
		// A malformed config can carry fewer non-empty-layer history
		// entries than diff IDs; guard the index so it cannot panic.
		if i < len(createdDates) && createdDates[i] != "" {
			desc.Annotations["buildkit/createdat"] = createdDates[i]
		}
		if i < len(createdMsg) && createdMsg[i] != "" {
			desc.Annotations["buildkit/description"] = createdMsg[i]
		}
		desc.Annotations["containerd.io/uncompressed"] = diffID.String()
		layers[dgst] = v1.DescriptorProviderPair{
			Descriptor: desc,
			// Layer content is never read for inline cache; see emptyProvider.
			Provider: &emptyProvider{},
		}
		config.Layers = append(config.Layers, v1.CacheLayer{
			Blob:        dgst,
			ParentIndex: i - 1, // -1 marks the base layer
		})
	}

	return v1.ParseConfig(config, layers, cc)
}
// image is the minimal subset of an image config needed here: the rootfs
// diff IDs, the embedded inline-cache manifest, and history entries used
// for created-at / created-by annotations.
type image struct {
	Rootfs struct {
		DiffIDs []digest.Digest `json:"diff_ids"`
	} `json:"rootfs"`
	// Cache is the raw inline cache manifest embedded under the
	// "moby.buildkit.cache.v0" key.
	Cache []byte `json:"moby.buildkit.cache.v0"`
	History []struct {
		Created *time.Time `json:"created,omitempty"`
		CreatedBy string `json:"created_by,omitempty"`
		EmptyLayer bool `json:"empty_layer,omitempty"`
	} `json:"history,omitempty"`
}
// parseCreatedLayerInfo returns, for each layer-producing history entry,
// the marshaled creation time (or "") and the creating command, in order.
func parseCreatedLayerInfo(img image) ([]string, []string, error) {
	n := len(img.Rootfs.DiffIDs)
	dates := make([]string, 0, n)
	createdBy := make([]string, 0, n)

	for _, h := range img.History {
		if h.EmptyLayer {
			// Entries that produced no layer are skipped so the results
			// line up index-for-index with the diff IDs.
			continue
		}
		var ts string
		if h.Created != nil {
			dt, err := h.Created.MarshalText()
			if err != nil {
				return nil, nil, err
			}
			ts = string(dt)
		}
		dates = append(dates, ts)
		createdBy = append(createdBy, h.CreatedBy)
	}
	return dates, createdBy, nil
}
// emptyProvider is a content provider that cannot serve any blob data:
// inline-cache layers are described by digest only and their content is
// never read.
type emptyProvider struct{}

// ReaderAt always fails, since no blob content is available.
func (p *emptyProvider) ReaderAt(ctx context.Context, dec ocispec.Descriptor) (content.ReaderAt, error) {
	return nil, errors.Errorf("ReaderAt not implemented for empty provider")
}
package snapshot
import (
"context"
"os"
"path/filepath"
"github.com/docker/docker/layer"
"github.com/docker/docker/pkg/longpath"
"github.com/opencontainers/image-spec/identity"
"github.com/pkg/errors"
bolt "go.etcd.io/bbolt"
"golang.org/x/sync/errgroup"
)
// GetDiffIDs returns the diff-ID chain for key if it is backed by a
// registered layer; (nil, nil) when key exists but has no layer.
func (s *snapshotter) GetDiffIDs(ctx context.Context, key string) ([]layer.DiffID, error) {
	l, err := s.getLayer(key, true)
	if err != nil {
		return nil, err
	}
	if l == nil {
		return nil, nil
	}
	return getDiffChain(l), nil
}
// EnsureLayer ensures the committed snapshot identified by key is
// registered as a layer in the layer store — registering its parent chain
// first (concurrently with checksumming this layer) — and returns the
// resulting diff-ID chain. Per-key locking prevents duplicate registration.
func (s *snapshotter) EnsureLayer(ctx context.Context, key string) ([]layer.DiffID, error) {
	s.layerCreateLocker.Lock(key)
	defer s.layerCreateLocker.Unlock(key)

	// Already registered?
	if diffIDs, err := s.GetDiffIDs(ctx, key); err != nil {
		return nil, err
	} else if diffIDs != nil {
		return diffIDs, nil
	}

	id, committed := s.getGraphDriverID(key)
	if !committed {
		return nil, errors.Errorf("can not convert active %s to layer", key)
	}

	info, err := s.Stat(ctx, key)
	if err != nil {
		return nil, err
	}

	eg, gctx := errgroup.WithContext(ctx)

	// TODO: add flightcontrol

	// Register the parent chain concurrently with checksumming this layer;
	// parentChainID is only read after eg.Wait().
	var parentChainID layer.ChainID
	if info.Parent != "" {
		eg.Go(func() error {
			diffIDs, err := s.EnsureLayer(gctx, info.Parent)
			if err != nil {
				return err
			}
			parentChainID = identity.ChainID(diffIDs)
			return nil
		})
	}

	tmpDir, err := longpath.MkdirTemp("", "docker-tarsplit")
	if err != nil {
		return nil, err
	}
	defer os.RemoveAll(tmpDir)
	tarSplitPath := filepath.Join(tmpDir, "tar-split")

	// diffID/size are written by the goroutine below and read after Wait.
	var diffID layer.DiffID
	var size int64
	eg.Go(func() error {
		// Resolve the parent's graphdriver ID (if any) so the checksum is
		// computed against the correct lower layer.
		parent := ""
		if p := info.Parent; p != "" {
			if l, err := s.getLayer(p, true); err != nil {
				return err
			} else if l != nil {
				parent, err = getGraphID(l)
				if err != nil {
					return err
				}
			} else {
				parent, _ = s.getGraphDriverID(info.Parent)
			}
		}
		diffID, size, err = s.reg.ChecksumForGraphID(id, parent, tarSplitPath)
		return err
	})

	if err := eg.Wait(); err != nil {
		return nil, err
	}

	l, err := s.reg.RegisterByGraphID(id, parentChainID, diffID, tarSplitPath, size)
	if err != nil {
		return nil, err
	}

	// Persist the chain ID so future GetDiffIDs lookups succeed.
	if err := s.db.Update(func(tx *bolt.Tx) error {
		b := tx.Bucket([]byte(key))
		b.Put(keyChainID, []byte(l.ChainID()))
		return nil
	}); err != nil {
		return nil, err
	}

	s.mu.Lock()
	s.refs[key] = l
	s.mu.Unlock()

	return getDiffChain(l), nil
}
// getDiffChain returns the diff IDs from the root layer down to l, in
// application order (root first).
func getDiffChain(l layer.Layer) []layer.DiffID {
	var chain []layer.DiffID
	for cur := l; cur != nil; cur = cur.Parent() {
		chain = append(chain, cur.DiffID())
	}
	// Collected leaf-to-root; reverse into root-to-leaf order.
	for i, j := 0, len(chain)-1; i < j; i, j = i+1, j-1 {
		chain[i], chain[j] = chain[j], chain[i]
	}
	return chain
}
// getGraphID extracts the graphdriver cache ID from a layer, failing when
// the layer implementation does not expose one.
func getGraphID(l layer.Layer) (string, error) {
	type cacheIDer interface {
		CacheID() string
	}
	if c, ok := l.(cacheIDer); ok {
		return c.CacheID(), nil
	}
	return "", errors.Errorf("couldn't access cacheID for %s", l.ChainID())
}
package snapshot
import (
"context"
"sync"
"github.com/containerd/containerd/v2/core/leases"
"github.com/containerd/log"
bolt "go.etcd.io/bbolt"
)
// sLM wraps a leases.Manager and maintains an in-memory two-way index
// between leases and the snapshots they reference, so a snapshot can be
// cleaned up when its last referencing lease is deleted.
type sLM struct {
	manager leases.Manager
	s *snapshotter
	// mu guards byLease and bySnapshot.
	mu sync.Mutex
	// byLease maps lease ID -> set of snapshot IDs it references.
	byLease map[string]map[string]struct{}
	// bySnapshot maps snapshot ID -> set of lease IDs referencing it.
	bySnapshot map[string]map[string]struct{}
}
// newLeaseManager returns an sLM tracking lease/snapshot references on top
// of the given lease manager.
func newLeaseManager(s *snapshotter, lm leases.Manager) *sLM {
	m := &sLM{
		s:          s,
		manager:    lm,
		byLease:    make(map[string]map[string]struct{}),
		bySnapshot: make(map[string]map[string]struct{}),
	}
	return m
}
// Create creates a new lease via the underlying manager.
func (l *sLM) Create(ctx context.Context, opts ...leases.Opt) (leases.Lease, error) {
	return l.manager.Create(ctx, opts...)
}
// Delete removes the lease via the underlying manager and then drops all
// snapshot references recorded for it.
func (l *sLM) Delete(ctx context.Context, lease leases.Lease, opts ...leases.DeleteOpt) error {
	if err := l.manager.Delete(ctx, lease, opts...); err != nil {
		return err
	}
	l.mu.Lock()
	defer l.mu.Unlock()
	for sID := range l.byLease[lease.ID] {
		l.delRef(lease.ID, sID)
	}
	return nil
}
// List lists leases via the underlying manager.
func (l *sLM) List(ctx context.Context, filters ...string) ([]leases.Lease, error) {
	return l.manager.List(ctx, filters...)
}
// AddResource registers the resource with the lease and, for snapshot
// resources, records the lease->snapshot reference.
func (l *sLM) AddResource(ctx context.Context, lease leases.Lease, resource leases.Resource) error {
	if err := l.manager.AddResource(ctx, lease, resource); err != nil {
		return err
	}
	if resource.Type != "snapshots/default" {
		return nil
	}
	l.mu.Lock()
	defer l.mu.Unlock()
	l.addRef(lease.ID, resource.ID)
	return nil
}
// DeleteResource removes the resource from the lease and, for snapshot
// resources, drops the lease->snapshot reference.
func (l *sLM) DeleteResource(ctx context.Context, lease leases.Lease, resource leases.Resource) error {
	if err := l.manager.DeleteResource(ctx, lease, resource); err != nil {
		return err
	}
	if resource.Type != "snapshots/default" {
		return nil
	}
	l.mu.Lock()
	defer l.mu.Unlock()
	l.delRef(lease.ID, resource.ID)
	return nil
}
// ListResources lists the resources attached to a lease via the underlying manager.
func (l *sLM) ListResources(ctx context.Context, lease leases.Lease) ([]leases.Resource, error) {
	return l.manager.ListResources(ctx, lease)
}
// addRef records that lease lID references snapshot sID, updating both
// lookup maps. The first time a snapshot gains any lease, its layer is
// loaded and, when the key resolves to a chain ID, the mapping is
// persisted in the database. Callers must hold l.mu.
func (l *sLM) addRef(lID, sID string) {
	load := false
	snapshots, ok := l.byLease[lID]
	if !ok {
		snapshots = map[string]struct{}{}
		l.byLease[lID] = snapshots
	}
	if _, ok := snapshots[sID]; !ok {
		snapshots[sID] = struct{}{}
	}
	leases, ok := l.bySnapshot[sID]
	if !ok {
		leases = map[string]struct{}{}
		// Fix: the reverse index must be stored under bySnapshot, not
		// byLease — otherwise delRef can never find the leases for a
		// snapshot and snapshots are never cleaned up.
		l.bySnapshot[sID] = leases
		load = true
	}
	if _, ok := leases[lID]; !ok {
		leases[lID] = struct{}{}
	}
	if load {
		l.s.getLayer(sID, true)
		if _, ok := l.s.chainID(sID); ok {
			l.s.db.Update(func(tx *bolt.Tx) error {
				b, err := tx.CreateBucketIfNotExists([]byte(lID))
				if err != nil {
					return err
				}
				return b.Put(keyChainID, []byte(sID))
			})
		}
	}
}
// delRef removes the lID<->sID association from both indexes. When the
// last lease referencing a snapshot is gone, the snapshot itself is
// removed. Callers must hold l.mu.
func (l *sLM) delRef(lID, sID string) {
	// Fix: both conditions were inverted (`if !ok`), so the cleanup only
	// ran when the entries did NOT exist — references were never dropped
	// and orphaned snapshots never removed.
	if snapshots, ok := l.byLease[lID]; ok {
		delete(snapshots, sID)
		if len(snapshots) == 0 {
			delete(l.byLease, lID)
		}
	}
	if leases, ok := l.bySnapshot[sID]; ok {
		delete(leases, lID)
		if len(leases) == 0 {
			delete(l.bySnapshot, sID)
			if err := l.s.remove(context.TODO(), sID); err != nil {
				log.G(context.TODO()).Warnf("failed to remove snapshot %v", sID)
			}
		}
	}
}
package snapshot
import (
"context"
"path/filepath"
"strconv"
"strings"
"sync"
"github.com/containerd/containerd/v2/core/leases"
"github.com/containerd/containerd/v2/core/mount"
"github.com/containerd/containerd/v2/core/snapshots"
cerrdefs "github.com/containerd/errdefs"
"github.com/docker/docker/daemon/graphdriver"
"github.com/docker/docker/layer"
"github.com/moby/buildkit/identity"
"github.com/moby/buildkit/snapshot"
"github.com/moby/buildkit/util/leaseutil"
"github.com/moby/locker"
"github.com/moby/sys/user"
"github.com/opencontainers/go-digest"
"github.com/pkg/errors"
bolt "go.etcd.io/bbolt"
)
// Bucket keys used in the snapshots.db bolt database. Each snapshot key
// has its own bucket containing these entries.
var (
	keyParent = []byte("parent") // name of the snapshot's parent
	keyCommitted = []byte("committed") // graphdriver ID recorded at commit
	keyIsCommitted = []byte("iscommitted") // marks a key that was committed
	keyChainID = []byte("chainid") // layer chain ID backing the key
	keySize = []byte("size") // cached diff size
)
// Opt defines options for creating the snapshotter
type Opt struct {
	GraphDriver graphdriver.Driver // underlying storage driver
	LayerStore layer.Store // layer registry backing committed snapshots
	Root string // directory holding snapshots.db
	IdentityMapping user.IdentityMapping // UID/GID remapping, may be empty
}
// graphIDRegistrar is the subset of the layer store needed to register
// layers directly from graphdriver IDs.
type graphIDRegistrar interface {
	RegisterByGraphID(string, layer.ChainID, layer.DiffID, string, int64) (layer.Layer, error)
	Release(layer.Layer) ([]layer.Metadata, error)
	checksumCalculator
}

// checksumCalculator computes a layer's diff ID and size from a
// graphdriver ID, writing tar-split metadata to newTarDataPath.
type checksumCalculator interface {
	ChecksumForGraphID(id, parent, newTarDataPath string) (diffID layer.DiffID, size int64, err error)
}
// snapshotter implements buildkit's snapshot.Snapshotter on top of a
// docker graphdriver and layer store, persisting metadata in a bolt DB.
type snapshotter struct {
	opt Opt
	// refs caches layers resolved by getLayer; guarded by mu.
	refs map[string]layer.Layer
	db *bolt.DB
	mu sync.Mutex
	reg graphIDRegistrar
	// layerCreateLocker serializes EnsureLayer per key.
	layerCreateLocker *locker.Locker
}
// NewSnapshotter creates a new snapshotter backed by a graphdriver and
// layer store, together with a lease manager scoped to namespace ns whose
// in-memory reference index is rebuilt from previously persisted leases.
func NewSnapshotter(opt Opt, prevLM leases.Manager, ns string) (snapshot.Snapshotter, *leaseutil.Manager, error) {
	dbPath := filepath.Join(opt.Root, "snapshots.db")
	db, err := bolt.Open(dbPath, 0o600, nil)
	if err != nil {
		return nil, nil, errors.Wrapf(err, "failed to open database file %s", dbPath)
	}
	reg, ok := opt.LayerStore.(graphIDRegistrar)
	if !ok {
		return nil, nil, errors.Errorf("layerstore doesn't support graphID registration")
	}
	s := &snapshotter{
		opt:               opt,
		db:                db,
		refs:              make(map[string]layer.Layer),
		reg:               reg,
		layerCreateLocker: locker.New(),
	}
	slm := newLeaseManager(s, prevLM)
	lm := leaseutil.WithNamespace(slm, ns)
	// Rebuild the lease->snapshot reference index from existing leases.
	existing, err := lm.List(context.TODO())
	if err != nil {
		return nil, nil, err
	}
	for _, lease := range existing {
		resources, err := lm.ListResources(context.TODO(), lease)
		if err != nil {
			return nil, nil, err
		}
		for _, res := range resources {
			if res.Type == "snapshots/default" {
				slm.addRef(lease.ID, res.ID)
			}
		}
	}
	return s, lm, nil
}
// Name returns the snapshotter's name.
func (s *snapshotter) Name() string {
	return "default"
}
// IdentityMapping returns the configured UID/GID remapping, or nil when
// none is configured.
func (s *snapshotter) IdentityMapping() *user.IdentityMapping {
	// Returning a non-nil but empty *IdentityMapping breaks BuildKit:
	// https://github.com/moby/moby/pull/39444
	if s.opt.IdentityMapping.Empty() {
		return nil
	}
	return &s.opt.IdentityMapping
}
// Prepare creates a new active snapshot named key on top of parent. The
// parent name is resolved to a graphdriver ID for the driver, while the
// original parent name is persisted in the database.
func (s *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) error {
	origParent := parent
	if parent != "" {
		l, err := s.getLayer(parent, false)
		if err != nil {
			return errors.Wrapf(err, "failed to get parent layer %s", parent)
		}
		if l != nil {
			if parent, err = getGraphID(l); err != nil {
				return errors.Wrapf(err, "failed to get parent graphid %s", l.ChainID())
			}
		} else {
			parent, _ = s.getGraphDriverID(parent)
		}
	}
	if err := s.opt.GraphDriver.Create(key, parent, nil); err != nil {
		return err
	}
	return s.db.Update(func(tx *bolt.Tx) error {
		b, err := tx.CreateBucketIfNotExists([]byte(key))
		if err != nil {
			return err
		}
		return b.Put(keyParent, []byte(origParent))
	})
}
// chainID interprets key as a layer chain ID when it is a valid sha256
// digest string; otherwise it reports false.
func (s *snapshotter) chainID(key string) (layer.ChainID, bool) {
	if !strings.HasPrefix(key, "sha256:") {
		return "", false
	}
	dgst, err := digest.Parse(key)
	if err != nil {
		return "", false
	}
	return dgst, true
}
// GetLayer returns the layer for key, including committed chain-ID lookups.
func (s *snapshotter) GetLayer(key string) (layer.Layer, error) {
	return s.getLayer(key, true)
}
// getLayer resolves key to a layer, caching the result in s.refs. A key
// that is itself a sha256 chain ID is loaded directly from the layer
// store; otherwise, when withCommitted is set, the chain ID persisted at
// EnsureLayer time is looked up in the database and resolved recursively.
// Returns (nil, nil) when the key has no backing layer. Note: s.mu is
// unlocked manually on every exit path because the recursion re-locks it.
func (s *snapshotter) getLayer(key string, withCommitted bool) (layer.Layer, error) {
	s.mu.Lock()
	l, ok := s.refs[key]
	if !ok {
		id, ok := s.chainID(key)
		if !ok {
			if !withCommitted {
				s.mu.Unlock()
				return nil, nil
			}
			// Look up the chain ID recorded for this key, if any.
			if err := s.db.View(func(tx *bolt.Tx) error {
				b := tx.Bucket([]byte(key))
				if b == nil {
					return nil
				}
				v := b.Get(keyChainID)
				if v != nil {
					id = layer.ChainID(v)
				}
				return nil
			}); err != nil {
				s.mu.Unlock()
				return nil, errors.WithStack(err)
			}
			s.mu.Unlock()
			if id == "" {
				return nil, nil
			}
			// Re-enter with the resolved chain ID (re-acquires the lock).
			return s.getLayer(string(id), withCommitted)
		}
		var err error
		l, err = s.opt.LayerStore.Get(id)
		if err != nil {
			s.mu.Unlock()
			return nil, errors.WithStack(err)
		}
		s.refs[key] = l
		// Ensure a bucket exists for the key so later metadata writes succeed.
		if err := s.db.Update(func(tx *bolt.Tx) error {
			_, err := tx.CreateBucketIfNotExists([]byte(key))
			return errors.WithStack(err)
		}); err != nil {
			s.mu.Unlock()
			return nil, err
		}
	}
	s.mu.Unlock()
	return l, nil
}
// getGraphDriverID resolves key to the committed graphdriver ID recorded
// in the database. When no committed ID is recorded (or the key is
// unknown) it returns (key, false).
func (s *snapshotter) getGraphDriverID(key string) (string, bool) {
	var gdID string
	err := s.db.View(func(tx *bolt.Tx) error {
		b := tx.Bucket([]byte(key))
		if b == nil {
			return errors.Wrapf(cerrdefs.ErrNotFound, "key %s", key)
		}
		if v := b.Get(keyCommitted); v != nil {
			gdID = string(v)
		}
		return nil
	})
	if err != nil || gdID == "" {
		return key, false
	}
	return gdID, true
}
// Stat returns metadata for the snapshot at key. A key that resolves
// directly to a layer reports KindCommitted with the layer's parent chain
// ID; otherwise the parent name stored at Prepare time is read from the
// database, falling back to a lazily-resolved committed layer.
func (s *snapshotter) Stat(ctx context.Context, key string) (snapshots.Info, error) {
	inf := snapshots.Info{
		Kind: snapshots.KindActive,
	}
	l, err := s.getLayer(key, false)
	if err != nil {
		return snapshots.Info{}, err
	}
	if l != nil {
		if p := l.Parent(); p != nil {
			inf.Parent = p.ChainID().String()
		}
		inf.Kind = snapshots.KindCommitted
		inf.Name = key
		return inf, nil
	}
	// Retry, this time following chain IDs persisted for committed keys.
	l, err = s.getLayer(key, true)
	if err != nil {
		return snapshots.Info{}, err
	}
	id, committed := s.getGraphDriverID(key)
	if committed {
		inf.Kind = snapshots.KindCommitted
	}
	if err := s.db.View(func(tx *bolt.Tx) error {
		b := tx.Bucket([]byte(id))
		if b == nil && l == nil {
			return errors.Wrapf(cerrdefs.ErrNotFound, "snapshot %s", id)
		}
		inf.Name = key
		if b != nil {
			// Prefer the parent name recorded at Prepare time.
			v := b.Get(keyParent)
			if v != nil {
				inf.Parent = string(v)
				return nil
			}
		}
		if l != nil {
			if p := l.Parent(); p != nil {
				inf.Parent = p.ChainID().String()
			}
			inf.Kind = snapshots.KindCommitted
		}
		return nil
	}); err != nil {
		return snapshots.Info{}, err
	}
	return inf, nil
}
// Mounts returns a lazily-acquired mountable for key. Keys backed by a
// committed layer get a fresh RW layer created on first Mount and released
// with the mount; other keys bind-mount the graphdriver directory.
func (s *snapshotter) Mounts(ctx context.Context, key string) (snapshot.Mountable, error) {
	l, err := s.getLayer(key, true)
	if err != nil {
		return nil, err
	}
	if l != nil {
		id := identity.NewID()
		var rwlayer layer.RWLayer
		return &mountable{
			idmap: s.opt.IdentityMapping,
			acquire: func() ([]mount.Mount, func() error, error) {
				// Create and mount a throwaway RW layer on top of l.
				rwlayer, err = s.opt.LayerStore.CreateRWLayer(id, l.ChainID(), nil)
				if err != nil {
					return nil, nil, err
				}
				rootfs, err := rwlayer.Mount("")
				if err != nil {
					return nil, nil, err
				}
				return []mount.Mount{{
					Source: rootfs,
					Type: "bind",
					Options: []string{"rbind"},
				}}, func() error {
					// Release the RW layer when the mount is released.
					_, err := s.opt.LayerStore.ReleaseRWLayer(rwlayer)
					return err
				}, nil
			},
		}, nil
	}
	id, _ := s.getGraphDriverID(key)
	return &mountable{
		idmap: s.opt.IdentityMapping,
		acquire: func() ([]mount.Mount, func() error, error) {
			rootfs, err := s.opt.GraphDriver.Get(id, "")
			if err != nil {
				return nil, nil, err
			}
			return []mount.Mount{{
				Source: rootfs,
				Type: "bind",
				Options: []string{"rbind"},
			}}, func() error {
				return s.opt.GraphDriver.Put(id)
			}, nil
		},
	}, nil
}
// Remove is intentionally disabled; snapshots are removed internally via
// the lease manager (which calls the unexported remove).
func (s *snapshotter) Remove(ctx context.Context, key string) error {
	return errors.Errorf("calling snapshot.remove is forbidden")
}
// remove deletes the snapshot's database state and backing storage. Keys
// that were committed (iscommitted set) only keep their metadata; keys
// backed by a layer release the layer, and everything else is removed
// from the graphdriver.
func (s *snapshotter) remove(ctx context.Context, key string) error {
	l, err := s.getLayer(key, true)
	if err != nil {
		return err
	}
	id, _ := s.getGraphDriverID(key)
	var found bool
	var alreadyCommitted bool
	if err := s.db.Update(func(tx *bolt.Tx) error {
		b := tx.Bucket([]byte(key))
		found = b != nil
		if b != nil {
			// Committed keys are kept: their data now belongs to the layer.
			if b.Get(keyIsCommitted) != nil {
				alreadyCommitted = true
				return nil
			}
		}
		if found {
			tx.DeleteBucket([]byte(key))
			if id != key {
				tx.DeleteBucket([]byte(id))
			}
		}
		return nil
	}); err != nil {
		return err
	}
	if alreadyCommitted {
		return nil
	}
	if l != nil {
		// Drop the cached reference and release the layer store's hold.
		s.mu.Lock()
		delete(s.refs, key)
		s.mu.Unlock()
		_, err := s.opt.LayerStore.Release(l)
		return err
	}
	if !found { // this happens when removing views
		return nil
	}
	return s.opt.GraphDriver.Remove(id)
}
// Commit records name as a committed snapshot pointing at key, and marks
// key itself as committed so remove() will not delete its data.
func (s *snapshotter) Commit(ctx context.Context, name, key string, opts ...snapshots.Opt) error {
	return s.db.Update(func(tx *bolt.Tx) error {
		nameBucket, err := tx.CreateBucketIfNotExists([]byte(name))
		if err != nil {
			return err
		}
		if err := nameBucket.Put(keyCommitted, []byte(key)); err != nil {
			return err
		}
		keyBucket, err := tx.CreateBucketIfNotExists([]byte(key))
		if err != nil {
			return err
		}
		return keyBucket.Put(keyIsCommitted, []byte{})
	})
}
// View returns a mountable for parent; the view key itself is not tracked.
func (s *snapshotter) View(ctx context.Context, key, parent string, opts ...snapshots.Opt) (snapshot.Mountable, error) {
	return s.Mounts(ctx, parent)
}
// Walk is a no-op; this snapshotter does not support enumeration.
func (s *snapshotter) Walk(context.Context, snapshots.WalkFunc, ...string) error {
	return nil
}
// Update does not modify anything; it returns the current info for the name.
func (s *snapshotter) Update(ctx context.Context, info snapshots.Info, fieldpaths ...string) (snapshots.Info, error) {
	// not implemented
	return s.Stat(ctx, info.Name)
}
// Usage reports the disk usage of the snapshot at key: the layer's diff
// size for layer-backed keys, a cached size from the database when
// available, or the graphdriver's DiffSize (which is then cached).
func (s *snapshotter) Usage(ctx context.Context, key string) (us snapshots.Usage, retErr error) {
	usage := snapshots.Usage{}
	if l, err := s.getLayer(key, true); err != nil {
		return usage, err
	} else if l != nil {
		usage.Size = l.DiffSize()
		return usage, nil
	}
	// Check the size cached in the database.
	size := int64(-1)
	if err := s.db.View(func(tx *bolt.Tx) error {
		b := tx.Bucket([]byte(key))
		if b == nil {
			return nil
		}
		v := b.Get(keySize)
		if v != nil {
			s, err := strconv.Atoi(string(v))
			if err != nil {
				return err
			}
			size = int64(s)
		}
		return nil
	}); err != nil {
		return usage, err
	}
	if size != -1 {
		usage.Size = size
		return usage, nil
	}
	// Compute the size via the graphdriver, resolving the parent first.
	id, _ := s.getGraphDriverID(key)
	info, err := s.Stat(ctx, key)
	if err != nil {
		return usage, err
	}
	var parent string
	if info.Parent != "" {
		if l, err := s.getLayer(info.Parent, false); err != nil {
			return usage, err
		} else if l != nil {
			parent, err = getGraphID(l)
			if err != nil {
				return usage, err
			}
		} else {
			parent, _ = s.getGraphDriverID(info.Parent)
		}
	}
	diffSize, err := s.opt.GraphDriver.DiffSize(id, parent)
	if err != nil {
		return usage, err
	}
	// Cache the computed size for subsequent calls.
	if err := s.db.Update(func(tx *bolt.Tx) error {
		b, err := tx.CreateBucketIfNotExists([]byte(key))
		if err != nil {
			return err
		}
		return b.Put(keySize, []byte(strconv.Itoa(int(diffSize))))
	}); err != nil {
		return usage, err
	}
	usage.Size = diffSize
	return usage, nil
}
// Close closes the underlying metadata database.
func (s *snapshotter) Close() error {
	return s.db.Close()
}
// mountable lazily acquires its mounts on first Mount call and reference
// counts releases so the underlying resource is freed exactly once.
type mountable struct {
	mu sync.Mutex
	// mounts is non-nil while the mounts are acquired.
	mounts []mount.Mount
	// acquire produces the mounts and a matching release function.
	acquire func() ([]mount.Mount, func() error, error)
	release func() error
	refCount int
	idmap user.IdentityMapping
}
// Mount returns the mounts, acquiring them on first use and incrementing
// the reference count on subsequent calls. The returned function releases
// one reference.
func (m *mountable) Mount() ([]mount.Mount, func() error, error) {
	m.mu.Lock()
	defer m.mu.Unlock()
	if m.mounts == nil {
		mounts, release, err := m.acquire()
		if err != nil {
			return nil, nil, err
		}
		m.mounts = mounts
		m.release = release
		m.refCount = 1
		return m.mounts, m.releaseMount, nil
	}
	m.refCount++
	return m.mounts, m.releaseMount, nil
}
// releaseMount drops one reference; when the last reference is released
// the acquired release function runs and the cached state is cleared.
func (m *mountable) releaseMount() error {
	m.mu.Lock()
	defer m.mu.Unlock()
	if m.refCount > 1 {
		m.refCount--
		return nil
	}
	m.refCount = 0
	if m.release == nil {
		return nil
	}
	m.mounts = nil
	// Clear release even when it returns an error, so it runs only once.
	defer func() {
		m.release = nil
	}()
	return m.release()
}
// IdentityMapping returns the mount's UID/GID remapping, or nil when no
// remapping is configured.
func (m *mountable) IdentityMapping() *user.IdentityMapping {
	// Returning a non-nil but empty *IdentityMapping breaks BuildKit:
	// https://github.com/moby/moby/pull/39444
	if m.idmap.Empty() {
		return nil
	}
	return &m.idmap
}
package buildkit
import (
"context"
"fmt"
"io"
"net/netip"
"strconv"
"strings"
"sync"
"time"
"github.com/containerd/containerd/v2/core/remotes/docker"
"github.com/containerd/platforms"
"github.com/docker/docker/daemon/builder"
"github.com/docker/docker/daemon/config"
"github.com/docker/docker/daemon/images"
"github.com/docker/docker/daemon/internal/builder-next/exporter"
"github.com/docker/docker/daemon/internal/builder-next/exporter/mobyexporter"
"github.com/docker/docker/daemon/internal/builder-next/exporter/overrides"
"github.com/docker/docker/daemon/libnetwork"
"github.com/docker/docker/daemon/pkg/opts"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/pkg/streamformatter"
controlapi "github.com/moby/buildkit/api/services/control"
"github.com/moby/buildkit/client"
"github.com/moby/buildkit/control"
"github.com/moby/buildkit/identity"
"github.com/moby/buildkit/session"
"github.com/moby/buildkit/util/entitlements"
"github.com/moby/buildkit/util/tracing"
"github.com/moby/moby/api/types/backend"
"github.com/moby/moby/api/types/build"
"github.com/moby/moby/api/types/container"
"github.com/moby/moby/api/types/network"
timetypes "github.com/moby/moby/api/types/time"
"github.com/moby/sys/user"
"github.com/pkg/errors"
"golang.org/x/sync/errgroup"
"google.golang.org/grpc"
grpcmetadata "google.golang.org/grpc/metadata"
"google.golang.org/protobuf/proto"
"tags.cncf.io/container-device-interface/pkg/cdi"
)
// errMultipleFilterValues is returned when a filter that accepts a single
// value was supplied several.
type errMultipleFilterValues struct{}

// Error implements the error interface.
func (errMultipleFilterValues) Error() string { return "filters expect only one value" }

// InvalidParameter marks this as an invalid-parameter (4xx) error.
func (errMultipleFilterValues) InvalidParameter() {}
// errConflictFilter is returned when two mutually exclusive filters are
// used together.
type errConflictFilter struct {
	a, b string
}

// Error implements the error interface.
func (e errConflictFilter) Error() string {
	return fmt.Sprintf("conflicting filters: %q and %q", e.a, e.b)
}

// InvalidParameter marks this as an invalid-parameter (4xx) error.
func (errConflictFilter) InvalidParameter() {}
// errInvalidFilterValue wraps a filter-value parse error so it is reported
// as an invalid-parameter (4xx) error.
type errInvalidFilterValue struct {
	error
}

func (errInvalidFilterValue) InvalidParameter() {}
// cacheFields is the set of filter keys recognized for build-cache
// records; only keys mapped to true are accepted from API clients.
var cacheFields = map[string]bool{
	"id": true,
	"parent": true,
	"type": true,
	"description": true,
	"inuse": true,
	"shared": true,
	"private": true,
	// fields from buildkit that are not exposed
	"mutable": false,
	"immutable": false,
}
// Opt is option struct required for creating the builder
type Opt struct {
	SessionManager *session.Manager
	Root string // state directory for the builder
	EngineID string
	Dist images.DistributionServices
	ImageTagger mobyexporter.ImageTagger
	NetworkController *libnetwork.Controller
	DefaultCgroupParent string
	RegistryHosts docker.RegistryHosts
	BuilderConfig config.BuilderConfig
	Rootless bool
	IdentityMapping user.IdentityMapping
	DNSConfig config.DNSConfig
	ApparmorProfile string
	UseSnapshotter bool // use containerd snapshotters instead of graphdrivers
	Snapshotter string
	ContainerdAddress string
	ContainerdNamespace string
	Callbacks exporter.BuildkitCallbacks
	CDICache *cdi.Cache
}
// Builder can build using BuildKit backend
type Builder struct {
	controller *control.Controller
	dnsconfig config.DNSConfig
	reqBodyHandler *reqBodyHandler
	// mu guards jobs.
	mu sync.Mutex
	// jobs tracks in-flight builds by build ID.
	jobs map[string]*buildJob
	useSnapshotter bool
}
// New creates a new builder
func New(ctx context.Context, opt Opt) (*Builder, error) {
	reqHandler := newReqBodyHandler(tracing.DefaultTransport)
	controller, err := newController(ctx, reqHandler, opt)
	if err != nil {
		return nil, err
	}
	return &Builder{
		controller:     controller,
		dnsconfig:      opt.DNSConfig,
		reqBodyHandler: reqHandler,
		jobs:           make(map[string]*buildJob),
		useSnapshotter: opt.UseSnapshotter,
	}, nil
}
// Close releases the resources held by the builder's controller.
func (b *Builder) Close() error {
	return b.controller.Close()
}
// RegisterGRPC registers controller to the grpc server.
func (b *Builder) RegisterGRPC(s *grpc.Server) {
	b.controller.Register(s)
}
// Cancel cancels a build using ID
func (b *Builder) Cancel(ctx context.Context, id string) error {
	b.mu.Lock()
	defer b.mu.Unlock()
	// Unknown IDs and jobs without a cancel function are ignored.
	if j, ok := b.jobs[id]; ok && j.cancel != nil {
		j.cancel()
	}
	return nil
}
// DiskUsage returns a report about space used by build cache
func (b *Builder) DiskUsage(ctx context.Context) ([]*build.CacheRecord, error) {
	duResp, err := b.controller.DiskUsage(ctx, &controlapi.DiskUsageRequest{})
	if err != nil {
		return nil, err
	}
	var items []*build.CacheRecord
	for _, r := range duResp.Record {
		items = append(items, &build.CacheRecord{
			ID: r.ID,
			Parent: r.Parent, //nolint:staticcheck // ignore SA1019 (Parent field is deprecated)
			Parents: r.Parents,
			Type: r.RecordType,
			Description: r.Description,
			InUse: r.InUse,
			Shared: r.Shared,
			Size: r.Size,
			// Protobuf timestamps may be nil; map them to Go zero values.
			CreatedAt: func() time.Time {
				if r.CreatedAt != nil {
					return r.CreatedAt.AsTime()
				}
				return time.Time{}
			}(),
			LastUsedAt: func() *time.Time {
				if r.LastUsedAt == nil {
					return nil
				}
				t := r.LastUsedAt.AsTime()
				return &t
			}(),
			UsageCount: int(r.UsageCount),
		})
	}
	return items, nil
}
// Prune clears all reclaimable build cache.
// It returns the total reclaimed size and the IDs of the pruned records.
func (b *Builder) Prune(ctx context.Context, opts build.CachePruneOptions) (int64, []string, error) {
	ch := make(chan *controlapi.UsageRecord)
	eg, ctx := errgroup.WithContext(ctx)
	// Accept the cache-record fields plus the prune-specific keys.
	validFilters := make(map[string]bool, 1+len(cacheFields))
	validFilters["unused-for"] = true
	validFilters["until"] = true
	validFilters["label"] = true // TODO(tiborvass): handle label
	validFilters["label!"] = true // TODO(tiborvass): handle label!
	for k, v := range cacheFields {
		validFilters[k] = v
	}
	if err := opts.Filters.Validate(validFilters); err != nil {
		return 0, nil, err
	}
	pi, err := toBuildkitPruneInfo(opts)
	if err != nil {
		return 0, nil, err
	}
	// Producer: run the prune, streaming usage records into ch.
	eg.Go(func() error {
		defer close(ch)
		return b.controller.Prune(&controlapi.PruneRequest{
			All: pi.All,
			KeepDuration: int64(pi.KeepDuration),
			ReservedSpace: pi.ReservedSpace,
			MaxUsedSpace: pi.MaxUsedSpace,
			MinFreeSpace: pi.MinFreeSpace,
			Filter: pi.Filter,
		}, &pruneProxy{
			streamProxy: streamProxy{ctx: ctx},
			ch: ch,
		})
	})
	// Consumer: accumulate reclaimed size and pruned IDs.
	var size int64
	var cacheIDs []string
	eg.Go(func() error {
		for r := range ch {
			size += r.Size
			cacheIDs = append(cacheIDs, r.ID)
		}
		return nil
	})
	if err := eg.Wait(); err != nil {
		return 0, nil, err
	}
	return size, cacheIDs, nil
}
// Build executes a build request
// It translates the legacy build options into buildkit frontend
// attributes, coordinates upload-request pairing for client-session
// contexts, runs the solve, and streams status back as aux messages.
func (b *Builder) Build(ctx context.Context, opt backend.BuildConfig) (*builder.Result, error) {
	if len(opt.Options.Outputs) > 1 {
		return nil, errors.Errorf("multiple outputs not supported")
	}
	rc := opt.Source
	// Pair upload requests with the build that consumes them via buildJob.
	if buildID := opt.Options.BuildID; buildID != "" {
		b.mu.Lock()
		upload := false
		if strings.HasPrefix(buildID, "upload-request:") {
			upload = true
			buildID = strings.TrimPrefix(buildID, "upload-request:")
		}
		if _, ok := b.jobs[buildID]; !ok {
			b.jobs[buildID] = newBuildJob()
		}
		j := b.jobs[buildID]
		var cancel func()
		ctx, cancel = context.WithCancel(ctx)
		j.cancel = cancel
		b.mu.Unlock()
		if upload {
			// This request only carries the context body; hand it to the
			// waiting build and return.
			ctx2, cancel := context.WithTimeout(ctx, 5*time.Second)
			defer cancel()
			err := j.SetUpload(ctx2, rc)
			return nil, err
		}
		if remoteContext := opt.Options.RemoteContext; remoteContext == "upload-request" {
			// This build waits for a matching upload request for its context.
			ctx2, cancel := context.WithTimeout(ctx, 5*time.Second)
			defer cancel()
			var err error
			rc, err = j.WaitUpload(ctx2)
			if err != nil {
				return nil, err
			}
			opt.Options.RemoteContext = ""
		}
		defer func() {
			b.mu.Lock()
			delete(b.jobs, buildID)
			b.mu.Unlock()
		}()
	}
	var out builder.Result
	// Translate build options into dockerfile frontend attributes.
	frontendAttrs := map[string]string{}
	if opt.Options.Target != "" {
		frontendAttrs["target"] = opt.Options.Target
	}
	if opt.Options.Dockerfile != "" && opt.Options.Dockerfile != "." {
		frontendAttrs["filename"] = opt.Options.Dockerfile
	}
	if opt.Options.RemoteContext != "" {
		if opt.Options.RemoteContext != "client-session" {
			frontendAttrs["context"] = opt.Options.RemoteContext
		}
	} else {
		// Serve the local context body over an internal URL.
		url, cancel := b.reqBodyHandler.newRequest(rc)
		defer cancel()
		frontendAttrs["context"] = url
	}
	cacheFrom := append([]string{}, opt.Options.CacheFrom...)
	frontendAttrs["cache-from"] = strings.Join(cacheFrom, ",")
	for k, v := range opt.Options.BuildArgs {
		if v == nil {
			continue
		}
		frontendAttrs["build-arg:"+k] = *v
	}
	for k, v := range opt.Options.Labels {
		frontendAttrs["label:"+k] = v
	}
	if opt.Options.NoCache {
		frontendAttrs["no-cache"] = ""
	}
	if opt.Options.PullParent {
		frontendAttrs["image-resolve-mode"] = "pull"
	} else {
		frontendAttrs["image-resolve-mode"] = "default"
	}
	if opt.Options.Platform != "" {
		// same as in newBuilder in builder/dockerfile.builder.go
		// TODO: remove once opt.Options.Platform is of type specs.Platform
		_, err := platforms.Parse(opt.Options.Platform)
		if err != nil {
			return nil, errdefs.InvalidParameter(err)
		}
		frontendAttrs["platform"] = opt.Options.Platform
	}
	switch opt.Options.NetworkMode {
	case network.NetworkHost, network.NetworkNone:
		frontendAttrs["force-network-mode"] = opt.Options.NetworkMode
	case "", network.NetworkDefault:
	default:
		return nil, errors.Errorf("network mode %q not supported by buildkit", opt.Options.NetworkMode)
	}
	extraHosts, err := toBuildkitExtraHosts(opt.Options.ExtraHosts, b.dnsconfig.HostGatewayIPs)
	if err != nil {
		return nil, err
	}
	frontendAttrs["add-hosts"] = extraHosts
	if opt.Options.ShmSize > 0 {
		frontendAttrs["shm-size"] = strconv.FormatInt(opt.Options.ShmSize, 10)
	}
	ulimits, err := toBuildkitUlimits(opt.Options.Ulimits)
	if err != nil {
		return nil, err
	} else if ulimits != "" {
		frontendAttrs["ulimit"] = ulimits
	}
	// Determine the exporter: default to the moby exporter, honoring an
	// explicit output spec ("cacheonly" skips exporting entirely).
	exporterName := ""
	exporterAttrs := map[string]string{}
	if len(opt.Options.Outputs) == 0 {
		exporterName = exporter.Moby
	} else {
		// cacheonly is a special type for triggering skipping all exporters
		if opt.Options.Outputs[0].Type != "cacheonly" {
			exporterName = opt.Options.Outputs[0].Type
			exporterAttrs = opt.Options.Outputs[0].Attrs
		}
	}
	if (exporterName == client.ExporterImage || exporterName == exporter.Moby) && len(opt.Options.Tags) > 0 {
		nameAttr, err := overrides.SanitizeRepoAndTags(opt.Options.Tags)
		if err != nil {
			return nil, err
		}
		if exporterAttrs == nil {
			exporterAttrs = make(map[string]string)
		}
		exporterAttrs["name"] = strings.Join(nameAttr, ",")
	}
	cache := &controlapi.CacheOptions{}
	if inlineCache := opt.Options.BuildArgs["BUILDKIT_INLINE_CACHE"]; inlineCache != nil {
		if b, err := strconv.ParseBool(*inlineCache); err == nil && b {
			cache.Exports = append(cache.Exports, &controlapi.CacheOptionsEntry{
				Type: "inline",
			})
		}
	}
	id := identity.NewID()
	req := &controlapi.SolveRequest{
		Ref: id,
		Exporters: []*controlapi.Exporter{
			{Type: exporterName, Attrs: exporterAttrs},
		},
		Frontend: "dockerfile.v0",
		FrontendAttrs: frontendAttrs,
		Session: opt.Options.SessionID,
		Cache: cache,
	}
	if opt.Options.NetworkMode == network.NetworkHost {
		req.Entitlements = append(req.Entitlements, string(entitlements.EntitlementNetworkHost))
	}
	aux := streamformatter.AuxFormatter{Writer: opt.ProgressWriter.Output}
	eg, ctx := errgroup.WithContext(ctx)
	// Solve goroutine: runs the build and emits the resulting image ID.
	eg.Go(func() error {
		resp, err := b.controller.Solve(ctx, req)
		if err != nil {
			return err
		}
		if exporterName != exporter.Moby && exporterName != client.ExporterImage {
			return nil
		}
		imgID, ok := resp.ExporterResponse["containerimage.digest"]
		if !ok {
			return errors.Errorf("missing image id")
		}
		out.ImageID = imgID
		return aux.Emit("moby.image.id", build.Result{ID: imgID})
	})
	ch := make(chan *controlapi.StatusResponse)
	// Status goroutine: streams solve status into ch.
	eg.Go(func() error {
		defer close(ch)
		// streamProxy.ctx is not set to ctx because when request is cancelled,
		// only the build request has to be cancelled, not the status request.
		stream := &statusProxy{streamProxy: streamProxy{ctx: context.TODO()}, ch: ch}
		return b.controller.Status(&controlapi.StatusRequest{Ref: id}, stream)
	})
	// Trace goroutine: forwards status messages to the client as aux payloads.
	eg.Go(func() error {
		for sr := range ch {
			dt, err := proto.Marshal(sr)
			if err != nil {
				return err
			}
			if err := aux.Emit("moby.buildkit.trace", dt); err != nil {
				return err
			}
		}
		return nil
	})
	if err := eg.Wait(); err != nil {
		return nil, err
	}
	return &out, nil
}
// streamProxy provides the grpc server-stream plumbing shared by the
// status and prune proxies; it carries only a context.
type streamProxy struct {
	ctx context.Context
}

func (sp *streamProxy) SetHeader(_ grpcmetadata.MD) error {
	return nil
}

func (sp *streamProxy) SendHeader(_ grpcmetadata.MD) error {
	return nil
}

func (sp *streamProxy) SetTrailer(_ grpcmetadata.MD) {
}

func (sp *streamProxy) Context() context.Context {
	return sp.ctx
}

// RecvMsg always reports end-of-stream; the proxies are send-only.
func (sp *streamProxy) RecvMsg(m interface{}) error {
	return io.EOF
}
// statusProxy adapts the controller's status stream onto a Go channel.
type statusProxy struct {
	streamProxy
	ch chan *controlapi.StatusResponse
}

func (sp *statusProxy) Send(resp *controlapi.StatusResponse) error {
	return sp.SendMsg(resp)
}

// SendMsg forwards status responses to the channel; other message types
// are silently ignored.
func (sp *statusProxy) SendMsg(m interface{}) error {
	if sr, ok := m.(*controlapi.StatusResponse); ok {
		sp.ch <- sr
	}
	return nil
}
// pruneProxy adapts the controller's prune stream onto a Go channel.
type pruneProxy struct {
	streamProxy
	ch chan *controlapi.UsageRecord
}

func (sp *pruneProxy) Send(resp *controlapi.UsageRecord) error {
	return sp.SendMsg(resp)
}

// SendMsg forwards usage records to the channel; other message types are
// silently ignored.
func (sp *pruneProxy) SendMsg(m interface{}) error {
	if sr, ok := m.(*controlapi.UsageRecord); ok {
		sp.ch <- sr
	}
	return nil
}
type wrapRC struct {
io.ReadCloser
once sync.Once
err error
waitCh chan struct{}
}
func (w *wrapRC) Read(b []byte) (int, error) {
n, err := w.ReadCloser.Read(b)
if err != nil {
switch err {
case io.EOF:
w.close(nil)
default:
w.close(err)
}
}
return n, err
}
func (w *wrapRC) Close() error {
err := w.ReadCloser.Close()
w.close(err)
return err
}
func (w *wrapRC) close(err error) {
w.once.Do(func() {
w.err = err
close(w.waitCh)
})
}
func (w *wrapRC) wait() error {
<-w.waitCh
return w.err
}
// buildJob tracks a single in-flight build: its cancel function and a
// rendezvous channel used to pair an upload request with the build that
// consumes it.
type buildJob struct {
	cancel func()
	waitCh chan func(io.ReadCloser) error
}

// newBuildJob returns a buildJob ready for upload coordination.
func newBuildJob() *buildJob {
	return &buildJob{waitCh: make(chan func(io.ReadCloser) error)}
}
// WaitUpload blocks until SetUpload supplies the request body (or ctx is
// done) and returns a reader for it. The function sent over waitCh keeps
// the SetUpload caller blocked until the returned reader reaches a
// terminal state (EOF, error, or Close).
func (j *buildJob) WaitUpload(ctx context.Context) (io.ReadCloser, error) {
	done := make(chan struct{})
	var upload io.ReadCloser
	fn := func(rc io.ReadCloser) error {
		w := &wrapRC{ReadCloser: rc, waitCh: make(chan struct{})}
		upload = w
		close(done)
		// Hold the SetUpload caller until the body is fully consumed.
		return w.wait()
	}
	select {
	case <-ctx.Done():
		return nil, ctx.Err()
	case j.waitCh <- fn:
		<-done
		return upload, nil
	}
}
// SetUpload hands the upload body rc to a waiting WaitUpload caller and
// blocks until the body has been consumed, or fails when ctx expires first.
func (j *buildJob) SetUpload(ctx context.Context, rc io.ReadCloser) error {
	select {
	case fn := <-j.waitCh:
		return fn(rc)
	case <-ctx.Done():
		return ctx.Err()
	}
}
// toBuildkitExtraHosts converts hosts from docker key:value format to buildkit's csv format
func toBuildkitExtraHosts(inp []string, hostGatewayIPs []netip.Addr) (string, error) {
	if len(inp) == 0 {
		return "", nil
	}
	hosts := make([]string, 0, len(inp))
	for _, h := range inp {
		host, ip, ok := strings.Cut(h, ":")
		if !ok || host == "" || ip == "" {
			return "", errors.Errorf("invalid host %s", h)
		}
		if ip != opts.HostGatewayName {
			hosts = append(hosts, host+"="+ip)
			continue
		}
		// If the IP Address is a "host-gateway", replace this value with the
		// IP address(es) stored in the daemon level HostGatewayIPs config variable.
		if len(hostGatewayIPs) == 0 {
			return "", errors.New("unable to derive the IP value for host-gateway")
		}
		for _, gip := range hostGatewayIPs {
			hosts = append(hosts, host+"="+gip.String())
		}
	}
	return strings.Join(hosts, ","), nil
}
// toBuildkitUlimits converts ulimits from docker type=soft:hard format to buildkit's csv format
func toBuildkitUlimits(inp []*container.Ulimit) (string, error) {
	if len(inp) == 0 {
		return "", nil
	}
	parts := make([]string, len(inp))
	for i, u := range inp {
		parts[i] = u.String()
	}
	return strings.Join(parts, ","), nil
}
// toBuildkitPruneInfo converts the API prune options (including the
// "until"/"unused-for" duration filter and cache-record field filters)
// into buildkit's PruneInfo.
func toBuildkitPruneInfo(opts build.CachePruneOptions) (client.PruneInfo, error) {
	var until time.Duration
	untilValues := opts.Filters.Get("until") // canonical
	unusedForValues := opts.Filters.Get("unused-for") // deprecated synonym for "until" filter
	if len(untilValues) > 0 && len(unusedForValues) > 0 {
		return client.PruneInfo{}, errConflictFilter{"until", "unused-for"}
	}
	// Remember which key was used so error messages name the right filter.
	filterKey := "until"
	if len(unusedForValues) > 0 {
		filterKey = "unused-for"
	}
	untilValues = append(untilValues, unusedForValues...)
	switch len(untilValues) {
	case 0:
		// nothing to do
	case 1:
		// Accept either a duration (e.g. "24h") or an absolute timestamp.
		ts, err := timetypes.GetTimestamp(untilValues[0], time.Now())
		if err != nil {
			return client.PruneInfo{}, errInvalidFilterValue{
				errors.Wrapf(err, "%q filter expects a duration (e.g., '24h') or a timestamp", filterKey),
			}
		}
		seconds, nanoseconds, err := timetypes.ParseTimestamps(ts, 0)
		if err != nil {
			return client.PruneInfo{}, errInvalidFilterValue{
				errors.Wrapf(err, "failed to parse timestamp %q", ts),
			}
		}
		until = time.Since(time.Unix(seconds, nanoseconds))
	default:
		return client.PruneInfo{}, errMultipleFilterValues{}
	}
	// Translate cache-record field filters into buildkit filter syntax;
	// "id" matches by prefix (~=), other fields match exactly (==).
	bkFilter := make([]string, 0, opts.Filters.Len())
	for cacheField := range cacheFields {
		if opts.Filters.Contains(cacheField) {
			values := opts.Filters.Get(cacheField)
			switch len(values) {
			case 0:
				bkFilter = append(bkFilter, cacheField)
			case 1:
				if cacheField == "id" {
					bkFilter = append(bkFilter, cacheField+"~="+values[0])
				} else {
					bkFilter = append(bkFilter, cacheField+"=="+values[0])
				}
			default:
				return client.PruneInfo{}, errMultipleFilterValues{}
			}
		}
	}
	// Honor the deprecated KeepStorage option when ReservedSpace is unset.
	if opts.ReservedSpace == 0 && opts.KeepStorage != 0 {
		opts.ReservedSpace = opts.KeepStorage
	}
	return client.PruneInfo{
		All: opts.All,
		KeepDuration: until,
		ReservedSpace: opts.ReservedSpace,
		MaxUsedSpace: opts.MaxUsedSpace,
		MinFreeSpace: opts.MinFreeSpace,
		Filter: []string{strings.Join(bkFilter, ",")},
	}, nil
}
package buildkit
import (
"context"
"fmt"
"net/http"
"os"
"path/filepath"
"runtime"
"strings"
"time"
ctd "github.com/containerd/containerd/v2/client"
ctdmetadata "github.com/containerd/containerd/v2/core/metadata"
"github.com/containerd/containerd/v2/core/snapshots"
"github.com/containerd/containerd/v2/plugins/content/local"
"github.com/containerd/log"
"github.com/containerd/platforms"
"github.com/docker/docker/daemon/config"
"github.com/docker/docker/daemon/graphdriver"
"github.com/docker/docker/daemon/internal/builder-next/adapters/containerimage"
"github.com/docker/docker/daemon/internal/builder-next/adapters/localinlinecache"
"github.com/docker/docker/daemon/internal/builder-next/adapters/snapshot"
"github.com/docker/docker/daemon/internal/builder-next/exporter/mobyexporter"
"github.com/docker/docker/daemon/internal/builder-next/imagerefchecker"
mobyworker "github.com/docker/docker/daemon/internal/builder-next/worker"
wlabel "github.com/docker/docker/daemon/internal/builder-next/worker/label"
"github.com/docker/go-units"
"github.com/moby/buildkit/cache"
"github.com/moby/buildkit/cache/metadata"
"github.com/moby/buildkit/cache/remotecache"
"github.com/moby/buildkit/cache/remotecache/gha"
inlineremotecache "github.com/moby/buildkit/cache/remotecache/inline"
localremotecache "github.com/moby/buildkit/cache/remotecache/local"
registryremotecache "github.com/moby/buildkit/cache/remotecache/registry"
"github.com/moby/buildkit/client"
bkconfig "github.com/moby/buildkit/cmd/buildkitd/config"
"github.com/moby/buildkit/control"
"github.com/moby/buildkit/frontend"
dockerfile "github.com/moby/buildkit/frontend/dockerfile/builder"
"github.com/moby/buildkit/frontend/gateway"
"github.com/moby/buildkit/frontend/gateway/forwarder"
containerdsnapshot "github.com/moby/buildkit/snapshot/containerd"
"github.com/moby/buildkit/solver"
"github.com/moby/buildkit/solver/bboltcachestorage"
"github.com/moby/buildkit/solver/llbsolver/cdidevices"
"github.com/moby/buildkit/solver/pb"
"github.com/moby/buildkit/util/apicaps"
"github.com/moby/buildkit/util/archutil"
"github.com/moby/buildkit/util/entitlements"
"github.com/moby/buildkit/util/network/netproviders"
"github.com/moby/buildkit/util/tracing"
"github.com/moby/buildkit/util/tracing/detect"
"github.com/moby/buildkit/worker"
"github.com/moby/buildkit/worker/containerd"
"github.com/moby/buildkit/worker/label"
"github.com/moby/moby/api/types/build"
"github.com/moby/moby/api/types/filters"
"github.com/pkg/errors"
bolt "go.etcd.io/bbolt"
"go.opentelemetry.io/otel/sdk/trace"
)
// newController builds a BuildKit controller backed by either the containerd
// snapshotter service or the legacy graph-driver store, depending on
// opt.UseSnapshotter.
func newController(ctx context.Context, rt http.RoundTripper, opt Opt) (*control.Controller, error) {
	if !opt.UseSnapshotter {
		return newGraphDriverController(ctx, rt, opt)
	}
	return newSnapshotterController(ctx, rt, opt)
}
// getTraceExporter assembles the span exporters used by the buildkit
// controller: the in-memory recorder (when configured) plus any exporter
// detected from the environment. Detection failures are logged, not fatal.
func getTraceExporter(ctx context.Context) trace.SpanExporter {
	exporters := make(tracing.MultiSpanExporter, 0, 2)
	if detect.Recorder != nil {
		exporters = append(exporters, detect.Recorder)
	}
	exp, err := detect.NewSpanExporter(ctx)
	switch {
	case err != nil:
		log.G(ctx).WithError(err).Error("Failed to detect trace exporter for buildkit controller")
	case !detect.IsNoneSpanExporter(exp):
		exporters = append(exporters, exp)
	}
	return exporters
}
// newSnapshotterController builds a BuildKit controller backed by the
// containerd snapshotter image store. It assembles history and cache
// storage, a containerd worker (with executor, GC policy and normalized
// platforms), the dockerfile/gateway frontends, and remote-cache
// import/export support.
func newSnapshotterController(ctx context.Context, rt http.RoundTripper, opt Opt) (*control.Controller, error) {
	if err := os.MkdirAll(opt.Root, 0o711); err != nil {
		return nil, err
	}
	// Build-history records live in a DB separate from the graphdriver
	// path's "history.db".
	historyDB, historyConf, err := openHistoryDB(opt.Root, "history_c8d.db", opt.BuilderConfig.History)
	if err != nil {
		return nil, err
	}
	cacheStorage, err := bboltcachestorage.NewStore(filepath.Join(opt.Root, "cache.db"))
	if err != nil {
		return nil, err
	}
	nc := netproviders.Opt{
		Mode: "host",
	}
	// HACK! Windows doesn't have 'host' mode networking.
	if runtime.GOOS == "windows" {
		nc = netproviders.Opt{
			Mode: "auto",
		}
	}
	dns := getDNSConfig(opt.DNSConfig)
	cdiManager, err := getCDIManager(opt)
	if err != nil {
		return nil, err
	}
	workerOpts := containerd.WorkerOptions{
		Root:            opt.Root,
		Address:         opt.ContainerdAddress,
		SnapshotterName: opt.Snapshotter,
		Namespace:       opt.ContainerdNamespace,
		Rootless:        opt.Rootless,
		Labels: map[string]string{
			label.Snapshotter: opt.Snapshotter,
		},
		DNS:             dns,
		NetworkOpt:      nc,
		ApparmorProfile: opt.ApparmorProfile,
		Selinux:         false,
		CDIManager:      cdiManager,
	}
	wo, err := containerd.NewWorkerOpt(workerOpts, ctd.WithTimeout(60*time.Second))
	if err != nil {
		return nil, err
	}
	policy, err := getGCPolicy(opt.BuilderConfig, opt.Root)
	if err != nil {
		return nil, err
	}
	// make sure platforms are normalized moby/buildkit#4391
	for i, p := range wo.Platforms {
		wo.Platforms[i] = platforms.Normalize(p)
	}
	wo.GCPolicy = policy
	wo.RegistryHosts = opt.RegistryHosts
	wo.Labels = getLabels(opt, wo.Labels)
	// The executor runs build (RUN) steps; it shares the worker's DNS and
	// CDI configuration.
	exec, err := newExecutor(
		opt.Root,
		opt.DefaultCgroupParent,
		opt.NetworkController,
		dns,
		opt.Rootless,
		opt.IdentityMapping,
		opt.ApparmorProfile,
		cdiManager,
		opt.ContainerdAddress,
		opt.ContainerdNamespace,
	)
	if err != nil {
		return nil, err
	}
	wo.Executor = exec
	w, err := mobyworker.NewContainerdWorker(ctx, wo, opt.Callbacks, rt)
	if err != nil {
		return nil, err
	}
	wc := &worker.Controller{}
	err = wc.Add(w)
	if err != nil {
		return nil, err
	}
	// "dockerfile.v0" builds Dockerfiles in-process; "gateway.v0" delegates
	// to an external frontend image.
	gwf, err := gateway.NewGatewayFrontend(wc.Infos(), nil)
	if err != nil {
		return nil, err
	}
	frontends := map[string]frontend.Frontend{
		"dockerfile.v0": forwarder.NewGatewayForwarder(wc.Infos(), dockerfile.Build),
		"gateway.v0":    gwf,
	}
	return control.NewController(control.Opt{
		SessionManager:   opt.SessionManager,
		WorkerController: wc,
		Frontends:        frontends,
		CacheManager:     solver.NewCacheManager(ctx, "local", cacheStorage, worker.NewCacheResultStorage(wc)),
		CacheStore:       cacheStorage,
		ResolveCacheImporterFuncs: map[string]remotecache.ResolveCacheImporterFunc{
			"gha":      gha.ResolveCacheImporterFunc(),
			"local":    localremotecache.ResolveCacheImporterFunc(opt.SessionManager),
			"registry": registryremotecache.ResolveCacheImporterFunc(opt.SessionManager, wo.ContentStore, opt.RegistryHosts),
		},
		ResolveCacheExporterFuncs: map[string]remotecache.ResolveCacheExporterFunc{
			"gha":      gha.ResolveCacheExporterFunc(),
			"inline":   inlineremotecache.ResolveCacheExporterFunc(),
			"local":    localremotecache.ResolveCacheExporterFunc(opt.SessionManager),
			"registry": registryremotecache.ResolveCacheExporterFunc(opt.SessionManager, opt.RegistryHosts),
		},
		Entitlements:   getEntitlements(opt.BuilderConfig),
		HistoryDB:      historyDB,
		HistoryConfig:  historyConf,
		LeaseManager:   wo.LeaseManager,
		ContentStore:   wo.ContentStore,
		TraceCollector: getTraceExporter(ctx),
		GarbageCollect: w.GarbageCollect,
	})
}
// openHistoryDB opens (creating if needed) the bolt database that stores
// build-history records, and converts the daemon's history settings into
// BuildKit's HistoryConfig. The returned config is nil when history is not
// configured.
func openHistoryDB(root string, fn string, cfg *config.BuilderHistoryConfig) (*bolt.DB, *bkconfig.HistoryConfig, error) {
	db, err := bolt.Open(filepath.Join(root, fn), 0o600, nil)
	if err != nil {
		return nil, nil, err
	}
	if cfg == nil {
		return db, nil, nil
	}
	return db, &bkconfig.HistoryConfig{
		MaxAge:     cfg.MaxAge,
		MaxEntries: cfg.MaxEntries,
	}, nil
}
// newGraphDriverController builds a BuildKit controller backed by the legacy
// graphdriver layer store. It adapts the graphdriver into containerd-style
// snapshotter/content stores, migrates old cache metadata to the v2 schema,
// and wires the worker, exporter, frontends and cache storage together.
func newGraphDriverController(ctx context.Context, rt http.RoundTripper, opt Opt) (*control.Controller, error) {
	if err := os.MkdirAll(opt.Root, 0o711); err != nil {
		return nil, err
	}
	dist := opt.Dist
	root := opt.Root

	// MergeOp/DiffOp are only supported with the containerd image store.
	pb.Caps.Init(apicaps.Cap{
		ID:                pb.CapMergeOp,
		Enabled:           false,
		DisabledReasonMsg: "only enabled with containerd image store backend",
	})
	pb.Caps.Init(apicaps.Cap{
		ID:                pb.CapDiffOp,
		Enabled:           false,
		DisabledReasonMsg: "only enabled with containerd image store backend",
	})

	var driver graphdriver.Driver
	if ls, ok := dist.LayerStore.(interface {
		Driver() graphdriver.Driver
	}); ok {
		driver = ls.Driver()
	} else {
		return nil, errors.Errorf("could not access graphdriver")
	}

	innerStore, err := local.NewStore(filepath.Join(root, "content"))
	if err != nil {
		return nil, err
	}
	db, err := bolt.Open(filepath.Join(root, "containerdmeta.db"), 0o644, nil)
	if err != nil {
		return nil, errors.WithStack(err)
	}
	mdb := ctdmetadata.NewDB(db, innerStore, map[string]snapshots.Snapshotter{})
	store := containerdsnapshot.NewContentStore(mdb.ContentStore(), "buildkit")

	snapshotter, lm, err := snapshot.NewSnapshotter(snapshot.Opt{
		GraphDriver:     driver,
		LayerStore:      dist.LayerStore,
		Root:            root,
		IdentityMapping: opt.IdentityMapping,
	}, ctdmetadata.NewLeaseManager(mdb), "buildkit")
	if err != nil {
		return nil, err
	}

	// Migrate metadata written by older daemons to the v2 schema before
	// opening the store.
	if err := cache.MigrateV2(context.Background(), filepath.Join(root, "metadata.db"), filepath.Join(root, "metadata_v2.db"), store, snapshotter, lm); err != nil {
		return nil, err
	}
	md, err := metadata.NewStore(filepath.Join(root, "metadata_v2.db"))
	if err != nil {
		return nil, err
	}

	layerGetter, ok := snapshotter.(imagerefchecker.LayerGetter)
	if !ok {
		return nil, errors.Errorf("snapshotter does not implement layergetter")
	}
	refChecker := imagerefchecker.New(imagerefchecker.Opt{
		ImageStore:  dist.ImageStore,
		LayerGetter: layerGetter,
	})
	cm, err := cache.NewManager(cache.ManagerOpt{
		Snapshotter:     snapshotter,
		MetadataStore:   md,
		PruneRefChecker: refChecker,
		LeaseManager:    lm,
		ContentStore:    store,
		GarbageCollect:  mdb.GarbageCollect,
		Root:            root,
	})
	if err != nil {
		return nil, err
	}
	src, err := containerimage.NewSource(containerimage.SourceOpt{
		CacheAccessor:   cm,
		ContentStore:    store,
		DownloadManager: dist.DownloadManager,
		MetadataStore:   dist.V2MetadataService,
		ImageStore:      dist.ImageStore,
		ReferenceStore:  dist.ReferenceStore,
		RegistryHosts:   opt.RegistryHosts,
		LayerStore:      dist.LayerStore,
		LeaseManager:    lm,
		GarbageCollect:  mdb.GarbageCollect,
	})
	if err != nil {
		return nil, err
	}

	dns := getDNSConfig(opt.DNSConfig)
	cdiManager, err := getCDIManager(opt)
	if err != nil {
		return nil, err
	}
	exec, err := newExecutorGD(
		root,
		opt.DefaultCgroupParent,
		opt.NetworkController,
		dns,
		opt.Rootless,
		opt.IdentityMapping,
		opt.ApparmorProfile,
		cdiManager,
		opt.ContainerdAddress,
		opt.ContainerdNamespace,
	)
	if err != nil {
		return nil, err
	}

	differ, ok := snapshotter.(mobyexporter.Differ)
	if !ok {
		return nil, errors.Errorf("snapshotter doesn't support differ")
	}
	exp, err := mobyexporter.New(mobyexporter.Opt{
		ImageStore:            dist.ImageStore,
		ContentStore:          store,
		Differ:                differ,
		ImageTagger:           opt.ImageTagger,
		LeaseManager:          lm,
		ImageExportedCallback: opt.Callbacks.Exported,
		// Callbacks.Named is not used here because the tag operation is handled directly by the image service.
	})
	if err != nil {
		return nil, err
	}

	cacheStorage, err := bboltcachestorage.NewStore(filepath.Join(opt.Root, "cache.db"))
	if err != nil {
		return nil, err
	}
	historyDB, historyConf, err := openHistoryDB(opt.Root, "history.db", opt.BuilderConfig.History)
	if err != nil {
		return nil, err
	}
	gcPolicy, err := getGCPolicy(opt.BuilderConfig, root)
	if err != nil {
		return nil, errors.Wrap(err, "could not get builder GC policy")
	}

	layers, ok := snapshotter.(mobyworker.LayerAccess)
	if !ok {
		// This previously reported "doesn't support differ" — a copy-paste of
		// the Differ check above; the missing capability here is LayerAccess.
		return nil, errors.Errorf("snapshotter doesn't support layer access")
	}

	// Clean up temporary leases left behind by an earlier, interrupted run.
	// Deletion is best-effort, but failures should at least be visible.
	leases, err := lm.List(ctx, `labels."buildkit/lease.temporary"`)
	if err != nil {
		return nil, err
	}
	for _, l := range leases {
		if err := lm.Delete(ctx, l); err != nil {
			log.G(ctx).WithError(err).Warn("failed to delete temporary buildkit lease")
		}
	}

	wopt := mobyworker.Opt{
		ID:                opt.EngineID,
		ContentStore:      store,
		CacheManager:      cm,
		GCPolicy:          gcPolicy,
		Snapshotter:       snapshotter,
		Executor:          exec,
		ImageSource:       src,
		DownloadManager:   dist.DownloadManager,
		V2MetadataService: dist.V2MetadataService,
		Exporter:          exp,
		Transport:         rt,
		Layers:            layers,
		Platforms:         archutil.SupportedPlatforms(true),
		LeaseManager:      lm,
		GarbageCollect:    mdb.GarbageCollect,
		Labels:            getLabels(opt, nil),
		CDIManager:        cdiManager,
	}
	wc := &worker.Controller{}
	w, err := mobyworker.NewWorker(wopt)
	if err != nil {
		return nil, err
	}
	// Check the error, matching newSnapshotterController (previously dropped).
	if err := wc.Add(w); err != nil {
		return nil, err
	}
	gwf, err := gateway.NewGatewayFrontend(wc.Infos(), nil)
	if err != nil {
		return nil, err
	}
	frontends := map[string]frontend.Frontend{
		"dockerfile.v0": forwarder.NewGatewayForwarder(wc.Infos(), dockerfile.Build),
		"gateway.v0":    gwf,
	}
	return control.NewController(control.Opt{
		SessionManager:   opt.SessionManager,
		WorkerController: wc,
		Frontends:        frontends,
		CacheManager:     solver.NewCacheManager(ctx, "local", cacheStorage, worker.NewCacheResultStorage(wc)),
		CacheStore:       cacheStorage,
		ResolveCacheImporterFuncs: map[string]remotecache.ResolveCacheImporterFunc{
			"registry": localinlinecache.ResolveCacheImporterFunc(opt.SessionManager, opt.RegistryHosts, store, dist.ReferenceStore, dist.ImageStore),
			"local":    localremotecache.ResolveCacheImporterFunc(opt.SessionManager),
		},
		ResolveCacheExporterFuncs: map[string]remotecache.ResolveCacheExporterFunc{
			"inline": inlineremotecache.ResolveCacheExporterFunc(),
		},
		Entitlements:   getEntitlements(opt.BuilderConfig),
		LeaseManager:   lm,
		ContentStore:   store,
		HistoryDB:      historyDB,
		HistoryConfig:  historyConf,
		TraceCollector: getTraceExporter(ctx),
		GarbageCollect: w.GarbageCollect,
	})
}
// getGCPolicy derives the builder's garbage-collection policy from the
// daemon config. When GC is enabled without an explicit policy, a default
// policy is synthesized from the configured default space limits; otherwise
// each configured rule is translated into a BuildKit prune rule. A nil
// policy is returned when GC is disabled.
func getGCPolicy(conf config.BuilderConfig, root string) ([]client.PruneInfo, error) {
	if !conf.GC.IsEnabled() {
		return nil, nil
	}
	if conf.GC.Policy == nil {
		reservedSpace, maxUsedSpace, minFreeSpace, err := parseGCPolicy(config.BuilderGCRule{
			ReservedSpace: conf.GC.DefaultReservedSpace,
			MaxUsedSpace:  conf.GC.DefaultMaxUsedSpace,
			MinFreeSpace:  conf.GC.DefaultMinFreeSpace,
		}, "default")
		if err != nil {
			return nil, err
		}
		return mobyworker.DefaultGCPolicy(root, reservedSpace, maxUsedSpace, minFreeSpace), nil
	}
	gcPolicy := make([]client.PruneInfo, len(conf.GC.Policy))
	for i, p := range conf.GC.Policy {
		reservedSpace, maxUsedSpace, minFreeSpace, err := parseGCPolicy(p, "")
		if err != nil {
			return nil, err
		}
		gcPolicy[i], err = toBuildkitPruneInfo(build.CachePruneOptions{
			All:           p.All,
			ReservedSpace: reservedSpace,
			MaxUsedSpace:  maxUsedSpace,
			MinFreeSpace:  minFreeSpace,
			Filters:       filters.Args(p.Filter),
		})
		if err != nil {
			return nil, err
		}
	}
	return gcPolicy, nil
}
// parseGCPolicy converts the human-readable size strings of a builder GC
// rule ("10GB", "2048M", ...) into byte counts. prefix, when non-empty, is
// prepended to the field name in error messages so that they name the config
// key the user wrote (e.g. "defaultReservedSpace").
func parseGCPolicy(p config.BuilderGCRule, prefix string) (reservedSpace, maxUsedSpace, minFreeSpace int64, err error) {
	errorString := func(key string) string {
		if prefix != "" && key != "" {
			// Capitalize only the first letter so "reservedSpace" becomes
			// "defaultReservedSpace". The previous strings.ToTitle(key)
			// upper-cased every rune, producing "defaultRESERVEDSPACE".
			key = prefix + strings.ToUpper(key[:1]) + key[1:]
		}
		return fmt.Sprintf("failed to parse %s", key)
	}
	if p.ReservedSpace != "" {
		b, err := units.RAMInBytes(p.ReservedSpace)
		if err != nil {
			return 0, 0, 0, errors.Wrap(err, errorString("reservedSpace"))
		}
		reservedSpace = b
	}
	if p.MaxUsedSpace != "" {
		b, err := units.RAMInBytes(p.MaxUsedSpace)
		if err != nil {
			return 0, 0, 0, errors.Wrap(err, errorString("maxUsedSpace"))
		}
		maxUsedSpace = b
	}
	if p.MinFreeSpace != "" {
		b, err := units.RAMInBytes(p.MinFreeSpace)
		if err != nil {
			return 0, 0, 0, errors.Wrap(err, errorString("minFreeSpace"))
		}
		minFreeSpace = b
	}
	return reservedSpace, maxUsedSpace, minFreeSpace, nil
}
// getEntitlements returns the entitlements to enable for the builder.
// In case of no config settings, network.host is enabled and
// security.insecure is disabled.
func getEntitlements(conf config.BuilderConfig) []string {
	var ents []string
	// network.host defaults to enabled when unset.
	if v := conf.Entitlements.NetworkHost; v == nil || *v {
		ents = append(ents, string(entitlements.EntitlementNetworkHost))
	}
	// security.insecure must be opted into explicitly.
	if v := conf.Entitlements.SecurityInsecure; v != nil && *v {
		ents = append(ents, string(entitlements.EntitlementSecurityInsecure))
	}
	return ents
}
// getLabels adds daemon-derived worker labels to the given map (allocating
// one when nil) and returns it. Currently this is only the host-gateway IP.
func getLabels(opt Opt, labels map[string]string) map[string]string {
	if labels == nil {
		labels = make(map[string]string)
	}
	// TODO(robmry) - buildx has its own version of toBuildkitExtraHosts(), which
	// needs to be updated to understand >1 address. For now, take the IPv4 address
	// if there is one, else IPv6.
	for _, gip := range opt.DNSConfig.HostGatewayIPs {
		labels[wlabel.HostGatewayIP] = gip.String()
		if gip.Is4() {
			break
		}
	}
	return labels
}
// getCDIManager returns a CDI device manager backed by opt.CDICache, or a
// nil manager when no CDI cache is configured.
func getCDIManager(opt Opt) (*cdidevices.Manager, error) {
	if opt.CDICache == nil {
		return nil, nil
	}
	// TODO: add support for auto-allowed devices from config
	return cdidevices.NewManager(opt.CDICache, nil), nil
}
package buildkit
import (
"context"
"net"
"os"
"path/filepath"
"sync"
"github.com/containerd/log"
"github.com/docker/docker/daemon/config"
"github.com/docker/docker/daemon/libnetwork"
"github.com/moby/buildkit/executor/oci"
resourcestypes "github.com/moby/buildkit/executor/resources/types"
"github.com/moby/buildkit/identity"
"github.com/moby/buildkit/util/network"
)
// bridgeProvider implements BuildKit's network.Provider on top of the
// daemon's libnetwork controller, attaching build containers to the default
// bridge network. Root is the directory holding per-sandbox state
// (hosts/resolv.conf files).
type bridgeProvider struct {
	*libnetwork.Controller
	Root string
}

// lnInterface is the network.Namespace handed to BuildKit for a single build
// container. The endpoint and sandbox are created asynchronously by init();
// ready is closed once setup finishes and err records any setup failure.
type lnInterface struct {
	ep  *libnetwork.Endpoint
	sbx *libnetwork.Sandbox
	sync.Once
	err      error
	ready    chan struct{}
	provider *bridgeProvider
}
// New creates a network namespace attached to the default bridge network.
// Endpoint/sandbox creation runs in a background goroutine; consumers of the
// returned interface block on its ready channel before using it.
func (p *bridgeProvider) New(_ context.Context, _ string) (network.Namespace, error) {
	n, err := p.NetworkByName(networkName)
	if err != nil {
		return nil, err
	}
	iface := &lnInterface{
		ready:    make(chan struct{}),
		provider: p,
	}
	iface.Once.Do(func() {
		go iface.init(p.Controller, n)
	})
	return iface, nil
}
// Close implements network.Provider; the bridge provider itself holds no
// resources to release.
func (p *bridgeProvider) Close() error {
	return nil
}
// init creates the libnetwork endpoint and sandbox for this interface and
// joins them. It runs in its own goroutine (spawned from New); completion —
// success or failure — is signalled by closing iface.ready, with any error
// stored in iface.err for Set/Close to report.
func (iface *lnInterface) init(c *libnetwork.Controller, n *libnetwork.Network) {
	defer close(iface.ready)
	// A fresh random ID names both the endpoint and the sandbox.
	id := identity.NewID()
	ep, err := n.CreateEndpoint(context.TODO(), id, libnetwork.CreateOptionDisableResolution())
	if err != nil {
		iface.err = err
		return
	}
	// Per-sandbox hosts/resolv.conf files live under the provider's root.
	sbx, err := c.NewSandbox(
		context.TODO(),
		id,
		libnetwork.OptionUseExternalKey(),
		libnetwork.OptionHostsPath(filepath.Join(iface.provider.Root, id, "hosts")),
		libnetwork.OptionResolvConfPath(filepath.Join(iface.provider.Root, id, "resolv.conf")),
	)
	if err != nil {
		iface.err = err
		return
	}
	if err := ep.Join(context.TODO(), sbx); err != nil {
		iface.err = err
		return
	}
	iface.sbx = sbx
	iface.ep = ep
}
// Sample returns network usage statistics for the interface. Currently a
// stub that always returns an empty sample.
// TODO(neersighted): Unstub Sample(), and collect data from the libnetwork Endpoint.
func (iface *lnInterface) Sample() (*resourcestypes.NetworkSample, error) {
	return new(resourcestypes.NetworkSample), nil
}
// Close waits for init to finish, then tears down the sandbox and its
// on-disk state in the background. It returns any error recorded by init.
func (iface *lnInterface) Close() error {
	// Wait until init() has either set up the sandbox or failed.
	<-iface.ready
	if iface.sbx != nil {
		// Teardown happens asynchronously; failures are only logged.
		go func() {
			if err := iface.sbx.Delete(context.TODO()); err != nil {
				log.G(context.TODO()).WithError(err).Errorf("failed to delete builder network sandbox")
			}
			if err := os.RemoveAll(filepath.Join(iface.provider.Root, iface.sbx.ContainerID())); err != nil {
				log.G(context.TODO()).WithError(err).Errorf("failed to delete builder sandbox directory")
			}
		}()
	}
	return iface.err
}
// getDNSConfig converts the daemon's DNS configuration into BuildKit's OCI
// DNS config. It returns nil when no DNS settings are configured at all.
func getDNSConfig(cfg config.DNSConfig) *oci.DNSConfig {
	if cfg.DNS == nil && cfg.DNSSearch == nil && cfg.DNSOptions == nil {
		return nil
	}
	return &oci.DNSConfig{
		Nameservers:   ipAddresses(cfg.DNS),
		SearchDomains: cfg.DNSSearch,
		Options:       cfg.DNSOptions,
	}
}
func ipAddresses(ips []net.IP) []string {
var addrs []string
for _, ip := range ips {
addrs = append(addrs, ip.String())
}
return addrs
}
package buildkit
import (
"context"
"os"
"path/filepath"
"strconv"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork"
"github.com/docker/docker/pkg/stringid"
"github.com/moby/buildkit/executor"
"github.com/moby/buildkit/executor/oci"
"github.com/moby/buildkit/executor/resources"
"github.com/moby/buildkit/executor/runcexecutor"
"github.com/moby/buildkit/solver/llbsolver/cdidevices"
"github.com/moby/buildkit/solver/pb"
"github.com/moby/buildkit/util/network"
"github.com/moby/sys/user"
"github.com/opencontainers/runtime-spec/specs-go"
)
const networkName = "bridge"
// newExecutor builds the runc-based BuildKit executor used to run build
// steps. It wires up the bridge/host/none network providers, clears network
// state left over from a previous daemon run, and honors the
// DOCKER_BUILDKIT_RUNC_COMMAND and DOCKER_RAMDISK environment overrides.
// The trailing containerd address/namespace parameters are unused on this
// path and exist for signature parity with the containerd-backed executor.
func newExecutor(root, cgroupParent string, net *libnetwork.Controller, dnsConfig *oci.DNSConfig, rootless bool, idmap user.IdentityMapping, apparmorProfile string, cdiManager *cdidevices.Manager, _, _ string) (executor.Executor, error) {
	netRoot := filepath.Join(root, "net")
	providers := map[pb.NetMode]network.Provider{
		pb.NetMode_UNSET: &bridgeProvider{Controller: net, Root: netRoot},
		pb.NetMode_HOST:  network.NewHostProvider(),
		pb.NetMode_NONE:  network.NewNoneProvider(),
	}

	// Best-effort removal of network state from previous runs; a missing
	// directory or individual delete failures are only logged.
	if entries, err := os.ReadDir(netRoot); err == nil {
		for _, entry := range entries {
			stale := filepath.Join(netRoot, entry.Name())
			if err := os.RemoveAll(stale); err != nil {
				log.G(context.TODO()).WithError(err).Errorf("failed to delete old network state: %v", stale)
			}
		}
	}

	// Returning a non-nil but empty *IdentityMapping breaks BuildKit:
	// https://github.com/moby/moby/pull/39444
	var pidmap *user.IdentityMapping
	if !idmap.Empty() {
		pidmap = &idmap
	}

	rm, err := resources.NewMonitor()
	if err != nil {
		return nil, err
	}

	runcCmds := []string{"runc"}
	// TODO: FIXME: testing env var, replace with something better or remove in a major version or two
	if override := os.Getenv("DOCKER_BUILDKIT_RUNC_COMMAND"); override != "" {
		runcCmds = []string{override}
	}

	return runcexecutor.New(runcexecutor.Opt{
		Root:                filepath.Join(root, "executor"),
		CommandCandidates:   runcCmds,
		DefaultCgroupParent: cgroupParent,
		Rootless:            rootless,
		NoPivot:             os.Getenv("DOCKER_RAMDISK") != "",
		IdentityMapping:     pidmap,
		DNS:                 dnsConfig,
		ApparmorProfile:     apparmorProfile,
		ResourceMonitor:     rm,
		CDIManager:          cdiManager,
	}, providers)
}
// newExecutorGD returns the executor used by the graph-driver controller.
// On Linux this simply delegates to newExecutor(); the function exists for
// symmetry with the non-linux platforms, esp. Windows.
func newExecutorGD(root, cgroupParent string, net *libnetwork.Controller, dnsConfig *oci.DNSConfig, rootless bool, idmap user.IdentityMapping, apparmorProfile string, cdiManager *cdidevices.Manager, _, _ string) (executor.Executor, error) {
	return newExecutor(root, cgroupParent, net, dnsConfig, rootless, idmap, apparmorProfile, cdiManager, "", "")
}
// Set wires the sandbox into the container's OCI spec: a prestart hook
// re-execs the daemon binary ("libnetwork-setkey") to attach the
// container's netns to the bridge. It blocks until init() has completed and
// returns (after logging) any error init recorded.
func (iface *lnInterface) Set(s *specs.Spec) error {
	<-iface.ready
	if err := iface.err; err != nil {
		log.G(context.TODO()).WithError(err).Error("failed to set networking spec")
		return err
	}
	shortNetCtlrID := stringid.TruncateID(iface.provider.Controller.ID())
	// attach netns to bridge within the container namespace, using reexec in a prestart hook
	hook := specs.Hook{
		Path: filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe"),
		Args: []string{
			"libnetwork-setkey",
			"-exec-root=" + iface.provider.Config().ExecRoot,
			iface.sbx.ContainerID(),
			shortNetCtlrID,
		},
	}
	s.Hooks = &specs.Hooks{Prestart: []specs.Hook{hook}}
	return nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package mobyexporter
import (
"bytes"
"context"
"fmt"
"slices"
"strings"
"github.com/containerd/containerd/v2/core/content"
"github.com/containerd/containerd/v2/core/leases"
"github.com/containerd/log"
"github.com/distribution/reference"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
"github.com/moby/buildkit/exporter"
"github.com/moby/buildkit/exporter/containerimage"
"github.com/moby/buildkit/exporter/containerimage/exptypes"
"github.com/moby/buildkit/util/leaseutil"
"github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// Differ can make a moby layer from a snapshot
type Differ interface {
	// EnsureLayer makes a moby layer from the snapshot identified by key and
	// returns the diff IDs of the resulting layer chain.
	EnsureLayer(ctx context.Context, key string) ([]layer.DiffID, error)
}

// ImageTagger applies a reference (tag) to an image in the image store.
type ImageTagger interface {
	TagImage(ctx context.Context, imageID image.ID, newTag reference.Named) error
}
// Opt defines a struct for creating new exporter
type Opt struct {
	ImageStore   image.Store
	Differ       Differ
	ImageTagger  ImageTagger
	ContentStore content.Store
	LeaseManager leases.Manager
	// ImageExportedCallback, when non-nil, is invoked after a successful
	// export with the new image ID and its config descriptor.
	ImageExportedCallback func(ctx context.Context, id string, desc ocispec.Descriptor)
}

// imageExporter exports build results into the moby image store.
type imageExporter struct {
	opt Opt
}
// New creates a new moby imagestore exporter
func New(opt Opt) (exporter.Exporter, error) {
	return &imageExporter{opt: opt}, nil
}
// Resolve creates an exporter instance for a single export request. The
// "name" attribute is parsed as a comma-separated list of image references;
// every other attribute is carried through unchanged as opaque metadata.
func (e *imageExporter) Resolve(ctx context.Context, id int, attrs map[string]string) (exporter.ExporterInstance, error) {
	inst := &imageExporterInstance{
		imageExporter: e,
		id:            id,
		attrs:         attrs,
	}
	for k, v := range attrs {
		if exptypes.ImageExporterOptKey(k) != exptypes.OptKeyName {
			if inst.meta == nil {
				inst.meta = make(map[string][]byte)
			}
			inst.meta[k] = []byte(v)
			continue
		}
		for _, name := range strings.Split(v, ",") {
			ref, err := reference.ParseNormalizedNamed(name)
			if err != nil {
				return nil, err
			}
			inst.targetNames = append(inst.targetNames, ref)
		}
	}
	return inst, nil
}
// imageExporterInstance is a single resolved export request.
type imageExporterInstance struct {
	*imageExporter
	id int
	// targetNames are the references to tag the exported image with.
	targetNames []reference.Named
	// meta carries all non-"name" attributes through to the export.
	meta  map[string][]byte
	attrs map[string]string
}

// ID returns the exporter instance ID assigned at Resolve time.
func (e *imageExporterInstance) ID() int {
	return e.id
}

// Type identifies this exporter as an image exporter.
func (e *imageExporterInstance) Type() string {
	return "image"
}

// Name is the human-readable progress label for this exporter.
func (e *imageExporterInstance) Name() string {
	return "exporting to image"
}

// Config returns the default exporter configuration.
func (e *imageExporterInstance) Config() *exporter.Config {
	return exporter.NewConfig()
}

// Attrs returns the raw attributes this instance was resolved with.
func (e *imageExporterInstance) Attrs() map[string]string {
	return e.attrs
}
// Export writes the build result into the moby image store: it finalizes and
// extracts the layer chain, patches the image config (rootfs diff IDs,
// history, optional inline cache), stores the config, applies any requested
// tags, and returns a temporary descriptor reference for the config blob.
func (e *imageExporterInstance) Export(ctx context.Context, inp *exporter.Source, inlineCache exptypes.InlineCache, sessionID string) (map[string]string, exporter.DescriptorReference, error) {
	if len(inp.Refs) > 1 {
		return nil, nil, errors.New("exporting multiple references to image store is currently unsupported")
	}
	ref := inp.Ref
	if ref != nil && len(inp.Refs) == 1 {
		return nil, nil, errors.New("invalid exporter input: Ref and Refs are mutually exclusive")
	}
	// only one loop
	for _, v := range inp.Refs {
		ref = v
	}
	var config []byte
	switch len(inp.Refs) {
	case 0:
		config = inp.Metadata[exptypes.ExporterImageConfigKey]
	case 1:
		// Platform-keyed metadata: pick the config for the single platform
		// present in Refs.
		ps, err := exptypes.ParsePlatforms(inp.Metadata)
		if err != nil {
			return nil, nil, fmt.Errorf("cannot export image, failed to parse platforms: %w", err)
		}
		if len(ps.Platforms) != len(inp.Refs) {
			return nil, nil, errors.Errorf("number of platforms does not match references %d %d", len(ps.Platforms), len(inp.Refs))
		}
		config = inp.Metadata[fmt.Sprintf("%s/%s", exptypes.ExporterImageConfigKey, ps.Platforms[0].ID)]
	}
	var diffs []digest.Digest
	if ref != nil {
		layersDone := oneOffProgress(ctx, "exporting layers")
		if err := ref.Finalize(ctx); err != nil {
			return nil, nil, layersDone(err)
		}
		if err := ref.Extract(ctx, nil); err != nil {
			return nil, nil, err
		}
		// EnsureLayer makes a moby layer from the snapshot (see Differ) and
		// yields the diff IDs for the image's rootfs.
		diffIDs, err := e.opt.Differ.EnsureLayer(ctx, ref.ID())
		if err != nil {
			return nil, nil, layersDone(err)
		}
		diffs = slices.Clone(diffIDs)
		_ = layersDone(nil)
	}
	if len(config) == 0 {
		// The build produced no config of its own; start from the default
		// empty image config.
		var err error
		config, err = emptyImageConfig()
		if err != nil {
			return nil, nil, err
		}
	}
	history, err := parseHistoryFromConfig(config)
	if err != nil {
		return nil, nil, err
	}
	diffs, history = normalizeLayersAndHistory(diffs, history, ref)
	var inlineCacheEntry *exptypes.InlineCacheEntry
	if inlineCache != nil {
		inlineCacheResult, err := inlineCache(ctx)
		if err != nil {
			return nil, nil, err
		}
		if inlineCacheResult != nil {
			if ref != nil {
				inlineCacheEntry, _ = inlineCacheResult.FindRef(ref.ID())
			} else {
				inlineCacheEntry = inlineCacheResult.Ref
			}
		}
	}
	config, err = patchImageConfig(config, diffs, history, inlineCacheEntry)
	if err != nil {
		return nil, nil, err
	}
	configDigest := digest.FromBytes(config)
	configDone := oneOffProgress(ctx, fmt.Sprintf("writing image %s", configDigest))
	id, err := e.opt.ImageStore.Create(config)
	if err != nil {
		return nil, nil, configDone(err)
	}
	_ = configDone(nil)
	var names []string
	for _, targetName := range e.targetNames {
		names = append(names, targetName.String())
		if e.opt.ImageTagger != nil {
			tagDone := oneOffProgress(ctx, "naming to "+targetName.String())
			if err := e.opt.ImageTagger.TagImage(ctx, image.ID(digest.Digest(id)), targetName); err != nil {
				return nil, nil, tagDone(err)
			}
			_ = tagDone(nil)
		}
	}
	resp := map[string]string{
		exptypes.ExporterImageConfigDigestKey: configDigest.String(),
		exptypes.ExporterImageDigestKey:       id.String(),
	}
	if len(names) > 0 {
		resp["image.name"] = strings.Join(names, ",")
	}
	// The descriptor reference keeps the config blob alive (via a temporary
	// lease) until the caller releases it.
	descRef, err := e.newTempReference(ctx, config)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to create a temporary descriptor reference: %w", err)
	}
	if e.opt.ImageExportedCallback != nil {
		e.opt.ImageExportedCallback(ctx, id.String(), descRef.Descriptor())
	}
	return resp, descRef, nil
}
// newTempReference writes the image config into the content store under a
// temporary lease and returns a descriptor reference for it. Releasing the
// reference drops the lease, allowing the blob to be garbage-collected.
func (e *imageExporterInstance) newTempReference(ctx context.Context, config []byte) (exporter.DescriptorReference, error) {
	lm := e.opt.LeaseManager
	dgst := digest.FromBytes(config)
	leaseCtx, done, err := leaseutil.WithLease(ctx, lm, leaseutil.MakeTemporary)
	if err != nil {
		return nil, err
	}
	// unlease drops the temporary lease, logging (but still returning) any
	// failure.
	unlease := func(ctx context.Context) error {
		err := done(context.WithoutCancel(ctx))
		if err != nil {
			log.G(ctx).WithError(err).Error("failed to delete descriptor reference lease")
		}
		return err
	}
	desc := ocispec.Descriptor{
		Digest:    dgst,
		MediaType: "application/vnd.docker.container.image.v1+json",
		Size:      int64(len(config)),
	}
	if err := content.WriteBlob(leaseCtx, e.opt.ContentStore, desc.Digest.String(), bytes.NewReader(config), desc); err != nil {
		// Best-effort lease cleanup; the write error is what matters here.
		unlease(leaseCtx)
		return nil, fmt.Errorf("failed to save temporary image config: %w", err)
	}
	return containerimage.NewDescriptorReference(desc, unlease), nil
}
package mobyexporter
import (
"context"
"encoding/json"
"time"
"github.com/containerd/log"
"github.com/containerd/platforms"
"github.com/moby/buildkit/cache"
"github.com/moby/buildkit/exporter/containerimage/exptypes"
"github.com/moby/buildkit/util/progress"
"github.com/moby/buildkit/util/system"
"github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// emptyImageConfig returns the serialized image config used when a build
// produces no config of its own: the normalized default platform, "/" as
// working directory, the default PATH for that OS, and a layers-type rootfs.
func emptyImageConfig() ([]byte, error) {
	pl := platforms.Normalize(platforms.DefaultSpec())
	img := ocispec.Image{
		Platform: pl,
		Config: ocispec.ImageConfig{
			WorkingDir: "/",
			Env:        []string{"PATH=" + system.DefaultPathEnv(pl.OS)},
		},
		RootFS: ocispec.RootFS{Type: "layers"},
	}
	dt, err := json.Marshal(img)
	if err != nil {
		return nil, errors.Wrap(err, "failed to create empty image config")
	}
	return dt, nil
}
// parseHistoryFromConfig extracts the "history" array from a serialized
// image config, ignoring all other fields.
func parseHistoryFromConfig(dt []byte) ([]ocispec.History, error) {
	var cfg struct {
		History []ocispec.History
	}
	err := json.Unmarshal(dt, &cfg)
	if err != nil {
		return nil, errors.Wrap(err, "failed to unmarshal history from config")
	}
	return cfg.History, nil
}
// patchImageConfig rewrites a serialized image config so that its rootfs
// diff IDs and history match the exported layers, fills in "created" (from
// the newest history timestamp) when absent, and embeds the optional
// inline-cache payload under "moby.buildkit.cache.v0".
func patchImageConfig(dt []byte, dps []digest.Digest, history []ocispec.History, cache *exptypes.InlineCacheEntry) ([]byte, error) {
	// Decode into a raw map so unknown config fields survive the round trip.
	m := map[string]json.RawMessage{}
	if err := json.Unmarshal(dt, &m); err != nil {
		return nil, errors.Wrap(err, "failed to parse image config for patch")
	}
	if m == nil {
		// JSON "null" unmarshals into a nil map.
		return nil, errors.New("null image config")
	}
	var rootFS ocispec.RootFS
	rootFS.Type = "layers"
	rootFS.DiffIDs = append(rootFS.DiffIDs, dps...)
	dt, err := json.Marshal(rootFS)
	if err != nil {
		return nil, errors.Wrap(err, "failed to marshal rootfs")
	}
	m["rootfs"] = dt
	dt, err = json.Marshal(history)
	if err != nil {
		return nil, errors.Wrap(err, "failed to marshal history")
	}
	m["history"] = dt
	if _, ok := m["created"]; !ok {
		// Use the timestamp of the last history entry that carries one
		// (nil — marshalled as null — when none do).
		var tm *time.Time
		for _, h := range history {
			if h.Created != nil {
				tm = h.Created
			}
		}
		dt, err = json.Marshal(&tm)
		if err != nil {
			return nil, errors.Wrap(err, "failed to marshal creation time")
		}
		m["created"] = dt
	}
	if cache != nil {
		dt, err = json.Marshal(cache.Data)
		if err != nil {
			return nil, err
		}
		m["moby.buildkit.cache.v0"] = dt
	}
	dt, err = json.Marshal(m)
	return dt, errors.Wrap(err, "failed to marshal config after patch")
}
// normalizeLayersAndHistory reconciles the image's history entries with the
// actual layer diff IDs: excess non-empty history entries are forced to
// empty-layer, missing entries are synthesized from the ref's layer
// metadata, and absent creation timestamps are back-filled. It returns the
// (unchanged) diffs and the adjusted history.
func normalizeLayersAndHistory(diffs []digest.Digest, history []ocispec.History, ref cache.ImmutableRef) ([]digest.Digest, []ocispec.History) {
	refMeta := getRefMetadata(ref, len(diffs))
	// Count history entries that claim to produce a layer.
	var historyLayers int
	for _, h := range history {
		if !h.EmptyLayer {
			historyLayers++
		}
	}
	if historyLayers > len(diffs) {
		// this case shouldn't happen but if it does force set history layers empty
		// from the bottom
		log.G(context.TODO()).Warn("invalid image config with unaccounted layers")
		historyCopy := make([]ocispec.History, 0, len(history))
		var l int
		for _, h := range history {
			if l >= len(diffs) {
				h.EmptyLayer = true
			}
			if !h.EmptyLayer {
				l++
			}
			historyCopy = append(historyCopy, h)
		}
		history = historyCopy
	}
	if len(diffs) > historyLayers {
		// some history items are missing. add them based on the ref metadata
		for _, md := range refMeta[historyLayers:] {
			history = append(history, ocispec.History{
				Created:   md.createdAt,
				CreatedBy: md.description,
				Comment:   "buildkit.exporter.image.v0",
			})
		}
	}
	// Stamp each non-empty history entry lacking a timestamp with its
	// layer's creation time.
	var layerIndex int
	for i, h := range history {
		if !h.EmptyLayer {
			if h.Created == nil {
				h.Created = refMeta[layerIndex].createdAt
			}
			layerIndex++
		}
		history[i] = h
	}
	// Find the first new layer time. Otherwise, the history item for a first
	// metadata command would be the creation time of a base image layer.
	// If there is no such then the last layer with timestamp.
	var created *time.Time
	var noCreatedTime bool
	for _, h := range history {
		if h.Created != nil {
			created = h.Created
			if noCreatedTime {
				break
			}
		} else {
			noCreatedTime = true
		}
	}
	// Fill in created times for all history items to be either the first new
	// layer time or the previous layer.
	noCreatedTime = false
	for i, h := range history {
		if h.Created != nil {
			if noCreatedTime {
				created = h.Created
			}
		} else {
			noCreatedTime = true
			h.Created = created
		}
		history[i] = h
	}
	return diffs, history
}
// refMetadata holds per-layer metadata extracted from a cache ref, used to
// synthesize or complete image history entries.
type refMetadata struct {
	description string     // human-readable description of what created the layer
	createdAt   *time.Time // layer creation time; nil when unknown
}
// getRefMetadata returns metadata for the topmost layers of ref's layer
// chain, at most limit entries. A nil ref yields limit zero-value entries.
func getRefMetadata(ref cache.ImmutableRef, limit int) []refMetadata {
	if ref == nil {
		return make([]refMetadata, limit)
	}
	layerChain := ref.LayerChain()
	defer layerChain.Release(context.TODO())
	if limit < len(layerChain) {
		// Only the top `limit` layers are of interest.
		layerChain = layerChain[len(layerChain)-limit:]
	}
	out := make([]refMetadata, 0, len(layerChain))
	for _, l := range layerChain {
		desc := l.GetDescription()
		if desc == "" {
			desc = "created by buildkit" // shouldn't be shown but don't fail build
		}
		created := l.GetCreatedAt()
		out = append(out, refMetadata{
			description: desc,
			createdAt:   &created,
		})
	}
	return out
}
// oneOffProgress writes a "started" progress event for id and returns a
// completion function that records the finish time and passes err through.
func oneOffProgress(ctx context.Context, id string) func(err error) error {
	pw, _, _ := progress.NewFromContext(ctx)
	start := time.Now()
	st := progress.Status{Started: &start}
	_ = pw.Write(id, st)
	return func(err error) error {
		// TODO: set error on status
		end := time.Now()
		st.Completed = &end
		_ = pw.Write(id, st)
		_ = pw.Close()
		return err
	}
}
package overrides
import (
"errors"
"github.com/distribution/reference"
)
// SanitizeRepoAndTags parses the raw names to a slice of repoAndTag.
// It removes duplicates and validates each repoName and tag to not contain a digest.
// SanitizeRepoAndTags parses the raw names to a slice of repoAndTag.
// It removes duplicates and validates each repoName and tag to not contain a digest.
func SanitizeRepoAndTags(names []string) (repoAndTags []string, _ error) {
	seen := make(map[string]struct{})
	for _, name := range names {
		if name == "" {
			continue
		}
		ref, err := reference.ParseNormalizedNamed(name)
		if err != nil {
			return nil, err
		}
		if _, isDigested := ref.(reference.Digested); isDigested {
			return nil, errors.New("build tag cannot contain a digest")
		}
		// Normalize to a fully tagged reference (adds ":latest" if absent).
		withTag := reference.TagNameOnly(ref).String()
		if _, dup := seen[withTag]; dup {
			continue
		}
		seen[withTag] = struct{}{}
		repoAndTags = append(repoAndTags, withTag)
	}
	return repoAndTags, nil
}
package exporter
import (
"context"
"strings"
"github.com/containerd/log"
"github.com/distribution/reference"
"github.com/docker/docker/daemon/internal/builder-next/exporter/overrides"
"github.com/moby/buildkit/exporter"
"github.com/moby/buildkit/exporter/containerimage/exptypes"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// BuildkitCallbacks holds hooks invoked by the wrapped exporter during an
// image build/export. Any field may be nil, in which case that hook is skipped.
type BuildkitCallbacks struct {
	// Exported is called when an image is exported by buildkit.
	Exported func(ctx context.Context, id string, desc ocispec.Descriptor)
	// Named is a callback that is called when an image is created in the
	// containerd image store by buildkit.
	Named func(ctx context.Context, ref reference.NamedTagged, desc ocispec.Descriptor)
}
// Wraps the containerimage exporter's Resolve method to apply moby-specific
// overrides to the exporter attributes.
type imageExporterMobyWrapper struct {
	exp       exporter.Exporter // the underlying containerimage exporter
	callbacks BuildkitCallbacks // hooks propagated to each resolved instance
}
// NewWrapper returns an exporter wrapper that applies moby specific attributes
// and hooks the export process.
func NewWrapper(exp exporter.Exporter, callbacks BuildkitCallbacks) (exporter.Exporter, error) {
	wrapper := &imageExporterMobyWrapper{
		exp:       exp,
		callbacks: callbacks,
	}
	return wrapper, nil
}
// Resolve applies moby specific attributes to the request.
func (e *imageExporterMobyWrapper) Resolve(ctx context.Context, id int, exporterAttrs map[string]string) (exporter.ExporterInstance, error) {
	if exporterAttrs == nil {
		exporterAttrs = map[string]string{}
	}
	// Validate and de-duplicate the requested image names.
	nameKey := string(exptypes.OptKeyName)
	reposAndTags, err := overrides.SanitizeRepoAndTags(strings.Split(exporterAttrs[nameKey], ","))
	if err != nil {
		return nil, err
	}
	exporterAttrs[nameKey] = strings.Join(reposAndTags, ",")
	// Moby always unpacks the image and tags dangling images with a
	// moby-specific prefix unless the caller chose otherwise.
	exporterAttrs[string(exptypes.OptKeyUnpack)] = "true"
	if _, ok := exporterAttrs[string(exptypes.OptKeyDanglingPrefix)]; !ok {
		exporterAttrs[string(exptypes.OptKeyDanglingPrefix)] = "moby-dangling"
	}
	exporterAttrs[string(exptypes.OptKeyDanglingEmptyOnly)] = "true"
	inst, err := e.exp.Resolve(ctx, id, exporterAttrs)
	if err != nil {
		return nil, err
	}
	return &imageExporterInstanceWrapper{
		ExporterInstance: inst,
		callbacks:        e.callbacks,
	}, nil
}
// imageExporterInstanceWrapper decorates an ExporterInstance so that the
// configured callbacks fire after a successful export.
type imageExporterInstanceWrapper struct {
	exporter.ExporterInstance
	callbacks BuildkitCallbacks
}
// Export delegates to the wrapped instance and, on success, invokes the
// Exported and Named callbacks with the resulting descriptor.
func (i *imageExporterInstanceWrapper) Export(ctx context.Context, src *exporter.Source, inlineCache exptypes.InlineCache, sessionID string) (map[string]string, exporter.DescriptorReference, error) {
	out, descRef, err := i.ExporterInstance.Export(ctx, src, inlineCache, sessionID)
	if err != nil {
		return out, descRef, err
	}
	desc := descRef.Descriptor()
	if cb := i.callbacks.Exported; cb != nil {
		cb(ctx, out[exptypes.ExporterImageDigestKey], desc)
	}
	if i.callbacks.Named != nil {
		i.processNamedCallback(ctx, out, desc)
	}
	return out, descRef, nil
}
// processNamedCallback parses the comma-separated image names produced by
// buildkit and invokes the Named callback once per valid tagged reference.
func (i *imageExporterInstanceWrapper) processNamedCallback(ctx context.Context, out map[string]string, desc ocispec.Descriptor) {
	// TODO(vvoland): Change to exptypes.ExporterImageNameKey when BuildKit v0.21 is vendored.
	imageName := out["image.name"]
	if imageName == "" {
		log.G(ctx).Warn("image named with empty image.name produced by buildkit")
		return
	}
	for _, name := range strings.Split(imageName, ",") {
		parsed, err := reference.ParseNormalizedNamed(name)
		if err != nil {
			// Shouldn't happen, but log if it does and continue.
			log.G(ctx).WithFields(log.Fields{
				"name":  name,
				"error": err,
			}).Warn("image named with invalid reference produced by buildkit")
			continue
		}
		namedTagged, ok := reference.TagNameOnly(parsed).(reference.NamedTagged)
		if !ok {
			continue
		}
		i.callbacks.Named(ctx, namedTagged, desc)
	}
}
package imagerefchecker
import (
"sync"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
"github.com/moby/buildkit/cache"
"github.com/opencontainers/go-digest"
)
// LayerGetter abstracts away the snapshotter
type LayerGetter interface {
	// GetLayer looks up the layer backing the given snapshot key.
	GetLayer(string) (layer.Layer, error)
}

// Opt represents the options needed to create a refchecker
type Opt struct {
	LayerGetter LayerGetter
	ImageStore  image.Store
}
// New creates new image reference checker that can be used to see if a reference
// is being used by any of the images in the image store
func New(opt Opt) cache.ExternalRefCheckerFunc {
	return func() (cache.ExternalRefChecker, error) {
		c := &checker{
			opt:    opt,
			layers: lchain{},
			cache:  map[string]bool{},
		}
		return c, nil
	}
}
// lchain is a trie keyed by layer DiffIDs, used to test whether a complete
// chain of layers belongs to some image.
type lchain map[layer.DiffID]lchain

// add inserts the full chain of ids into the trie, creating nodes as needed.
func (c lchain) add(ids []layer.DiffID) {
	node := c
	for _, id := range ids {
		child, ok := node[id]
		if !ok {
			child = lchain{}
			node[id] = child
		}
		node = child
	}
}

// has reports whether the full chain of ids is present in the trie.
func (c lchain) has(ids []layer.DiffID) bool {
	node := c
	for _, id := range ids {
		child, ok := node[id]
		if !ok {
			return false
		}
		node = child
	}
	return true
}
// checker implements cache.ExternalRefChecker against the image store.
type checker struct {
	opt    Opt
	once   sync.Once       // guards lazy population of layers via init
	layers lchain          // trie of layer chains referenced by known images
	cache  map[string]bool // memoized Exists results keyed by snapshot key
}
// Exists reports whether the layer identified by key is referenced by any
// image in the image store. Results are memoized per key.
func (c *checker) Exists(key string, chain []digest.Digest) bool {
	if c.opt.ImageStore == nil {
		return false
	}
	c.once.Do(c.init)
	if cached, ok := c.cache[key]; ok {
		return cached
	}
	l, err := c.opt.LayerGetter.GetLayer(key)
	if err != nil || l == nil {
		c.cache[key] = false
		return false
	}
	exists := c.layers.has(diffIDs(l))
	c.cache[key] = exists
	return exists
}
// init seeds the layer trie with the DiffID chains of every known image.
func (c *checker) init() {
	for _, img := range c.opt.ImageStore.Map() {
		c.layers.add(img.RootFS.DiffIDs)
	}
}
// diffIDs returns the DiffIDs of l and all its ancestors, ordered from the
// bottom-most (root) layer to l itself.
func diffIDs(l layer.Layer) []layer.DiffID {
	var ids []layer.DiffID
	for ; l != nil; l = l.Parent() {
		ids = append(ids, l.DiffID())
	}
	// Collected top-down; reverse to match the root-first order callers expect.
	for i, j := 0, len(ids)-1; i < j; i, j = i+1, j-1 {
		ids[i], ids[j] = ids[j], ids[i]
	}
	return ids
}
package buildkit
import (
"io"
"net/http"
"strings"
"sync"
"github.com/moby/buildkit/identity"
"github.com/pkg/errors"
)
// urlPrefix marks synthetic hosts whose "response body" is a locally
// registered build-context stream rather than a real HTTP resource.
const urlPrefix = "build-context-"

// reqBodyHandler is an http.RoundTripper that serves registered readers for
// synthetic build-context URLs and delegates everything else to rt.
type reqBodyHandler struct {
	mu sync.Mutex
	rt http.RoundTripper

	// requests maps a generated ID to the pending body for that build context.
	requests map[string]io.ReadCloser
}
// newReqBodyHandler wraps rt with build-context request interception.
func newReqBodyHandler(rt http.RoundTripper) *reqBodyHandler {
	h := &reqBodyHandler{rt: rt}
	h.requests = map[string]io.ReadCloser{}
	return h
}
// newRequest registers rc under a fresh ID and returns the synthetic URL that
// serves it, plus a release function that unregisters and closes the reader.
func (h *reqBodyHandler) newRequest(rc io.ReadCloser) (string, func()) {
	id := identity.NewID()
	h.mu.Lock()
	h.requests[id] = rc
	h.mu.Unlock()
	release := func() {
		h.mu.Lock()
		delete(h.requests, id)
		h.mu.Unlock()
		rc.Close()
	}
	return "http://" + urlPrefix + id, release
}
// RoundTrip serves GET requests to synthetic build-context hosts from the
// registered readers (one-shot: the entry is removed on first use) and
// forwards all other requests to the underlying transport.
func (h *reqBodyHandler) RoundTrip(req *http.Request) (*http.Response, error) {
	host := req.URL.Host
	if !strings.HasPrefix(host, urlPrefix) {
		return h.rt.RoundTrip(req)
	}
	if req.Method != http.MethodGet {
		return nil, errors.Errorf("invalid request")
	}
	id := strings.TrimPrefix(host, urlPrefix)
	h.mu.Lock()
	rc, ok := h.requests[id]
	delete(h.requests, id)
	h.mu.Unlock()
	if !ok {
		return nil, errors.Errorf("context not found")
	}
	return &http.Response{
		Status:        "200 OK",
		StatusCode:    http.StatusOK,
		Body:          rc,
		ContentLength: -1,
	}, nil
}
package worker
import (
"context"
nethttp "net/http"
"github.com/containerd/log"
"github.com/docker/docker/daemon/internal/builder-next/exporter"
"github.com/moby/buildkit/client"
bkexporter "github.com/moby/buildkit/exporter"
"github.com/moby/buildkit/session"
"github.com/moby/buildkit/source/http"
"github.com/moby/buildkit/worker/base"
)
// ContainerdWorker is a local worker instance with dedicated snapshotter, cache, and so on.
type ContainerdWorker struct {
	*base.Worker
	// callbacks are forwarded to the moby exporter wrapper on Exporter().
	callbacks exporter.BuildkitCallbacks
}
// NewContainerdWorker instantiates a local worker.
func NewContainerdWorker(ctx context.Context, wo base.WorkerOpt, callbacks exporter.BuildkitCallbacks, rt nethttp.RoundTripper) (*ContainerdWorker, error) {
	bw, err := base.NewWorker(ctx, wo)
	if err != nil {
		return nil, err
	}
	// The http source is best-effort: failure to register only degrades
	// functionality, so log and continue.
	hs, err := http.NewSource(http.Opt{
		CacheAccessor: bw.CacheManager(),
		Transport:     rt,
	})
	if err != nil {
		log.G(ctx).Warnf("Could not register builder http source: %s", err)
	} else {
		bw.SourceManager.Register(hs)
	}
	return &ContainerdWorker{Worker: bw, callbacks: callbacks}, nil
}
// Exporter returns exporter by name
func (w *ContainerdWorker) Exporter(name string, sm *session.Manager) (bkexporter.Exporter, error) {
	if name != exporter.Moby {
		return w.Worker.Exporter(name, sm)
	}
	// The moby exporter is the image exporter wrapped with moby-specific
	// attribute overrides and callbacks.
	exp, err := w.Worker.Exporter(client.ExporterImage, sm)
	if err != nil {
		return nil, err
	}
	return exporter.NewWrapper(exp, w.callbacks)
}
package worker
import (
"math"
"time"
"github.com/moby/buildkit/client"
"github.com/moby/buildkit/util/disk"
)
const (
	// defaultReservedSpaceBytes is the fallback reserved space when disk
	// stats cannot be read.
	defaultReservedSpaceBytes int64 = 2e9 // 2GB

	// Percentages of total disk size used to derive the default GC limits
	// when no explicit values are configured.
	defaultReservedSpacePercentage int64 = 10
	defaultMaxUsedPercentage       int64 = 80
	defaultMinFreePercentage       int64 = 20
)

// tempCachePercent represents the percentage ratio of the cache size in bytes to temporarily keep for a short period of time (couple of days)
// over the total cache size in bytes. Because there is no perfect value, a mathematically pleasing one was chosen.
// The value is approximately 13.8
const tempCachePercent = math.E * math.Pi * math.Phi
// DefaultGCPolicy returns a default builder GC policy
func DefaultGCPolicy(p string, reservedSpace, maxUsedSpace, minFreeSpace int64) []client.PruneInfo {
	if reservedSpace == 0 && maxUsedSpace == 0 && minFreeSpace == 0 {
		// Only check the disk if we need to fill in an inferred value.
		dstat, err := disk.GetDiskStat(p)
		if err != nil {
			// Fill in only reserved space if we cannot read the disk.
			reservedSpace = defaultReservedSpaceBytes
		} else {
			// Fill in default values only if we can read the disk.
			reservedSpace = diskPercentage(dstat, defaultReservedSpacePercentage)
			maxUsedSpace = diskPercentage(dstat, defaultMaxUsedPercentage)
			minFreeSpace = diskPercentage(dstat, defaultMinFreePercentage)
		}
	}

	const minTempCacheReservedSpace = 512 * 1e6 // 512MB
	tempCacheReservedSpace := int64(math.Round(float64(reservedSpace) / 100. * float64(tempCachePercent)))
	if tempCacheReservedSpace < minTempCacheReservedSpace {
		tempCacheReservedSpace = minTempCacheReservedSpace
	}

	return []client.PruneInfo{
		// if build cache uses more than 512MB delete the most easily reproducible data after it has not been used for 2 days
		{
			Filter:       []string{"type==source.local,type==exec.cachemount,type==source.git.checkout"},
			KeepDuration: 48 * time.Hour,
			MaxUsedSpace: tempCacheReservedSpace,
		},
		// remove any data not used for 60 days
		{
			KeepDuration:  60 * 24 * time.Hour,
			ReservedSpace: reservedSpace,
			MaxUsedSpace:  maxUsedSpace,
			MinFreeSpace:  minFreeSpace,
		},
		// keep the unshared build cache under cap
		{
			ReservedSpace: reservedSpace,
			MaxUsedSpace:  maxUsedSpace,
			MinFreeSpace:  minFreeSpace,
		},
		// if previous policies were insufficient start deleting internal data to keep build cache under cap
		{
			All:           true,
			ReservedSpace: reservedSpace,
			MaxUsedSpace:  maxUsedSpace,
			MinFreeSpace:  minFreeSpace,
		},
	}
}
// diskPercentage returns the given percentage of the total disk size, rounded
// up to the next whole GB and expressed in bytes.
func diskPercentage(dstat disk.DiskStat, percentage int64) int64 {
	// Take percentage% of the total size. The previous expression divided the
	// total BY the percentage (Total/percentage), which is only equivalent for
	// percentage == 10 and wildly wrong for 80 (1.25% instead of 80%).
	avail := dstat.Total * percentage / 100
	return (avail/(1<<30) + 1) * 1e9 // round up
}
package mod
import (
"runtime/debug"
"sync"
"golang.org/x/mod/module"
"golang.org/x/mod/semver"
)
var (
	// buildInfoOnce guards the one-time read of the embedded build info.
	buildInfoOnce sync.Once
	// buildInfo caches the result of debug.ReadBuildInfo; nil if unavailable.
	buildInfo *debug.BuildInfo
)
func Version(name string) (modVersion string) {
return moduleVersion(name, readBuildInfo())
}
// moduleVersion looks up the version of the module `name` in bi, resolving
// replace directives and reducing pseudo-versions to "<base>+<revision>"
// (e.g. v0.10.7-0.20230306143919-70f2ad56d3e5 => v0.10.6+70f2ad56d3e5).
func moduleVersion(name string, bi *debug.BuildInfo) (modVersion string) {
	if bi == nil {
		return ""
	}
	for _, dep := range bi.Deps {
		if dep.Path != name {
			continue
		}
		v := dep.Version
		if dep.Replace != nil {
			// if the version is replaced, use the replacement's version
			v = dep.Replace.Version
		}
		if !module.IsPseudoVersion(v) {
			return v
		}
		// Pseudo version: derive a friendlier form from its base version.
		base, err := module.PseudoVersionBase(v)
		if err != nil || base == "" {
			return v
		}
		// Canonicalize (drops any +incompatible suffix, e.g. v2.1.2+incompatible => v2.1.2).
		base = semver.Canonical(base)
		if rev, revErr := module.PseudoVersionRev(v); revErr == nil && rev != "" {
			// Append the commit revision to the base version.
			return base + "+" + rev
		}
		// Revision unavailable: fall back to the base version alone.
		return base
	}
	return ""
}
// readBuildInfo lazily reads and caches the binary's embedded build info.
func readBuildInfo() *debug.BuildInfo {
	buildInfoOnce.Do(func() {
		bi, _ := debug.ReadBuildInfo()
		buildInfo = bi
	})
	return buildInfo
}
package worker
import (
"context"
"fmt"
"io"
nethttp "net/http"
"time"
"github.com/containerd/containerd/v2/core/content"
c8dimages "github.com/containerd/containerd/v2/core/images"
"github.com/containerd/containerd/v2/pkg/gc"
"github.com/containerd/containerd/v2/pkg/rootfs"
cerrdefs "github.com/containerd/errdefs"
"github.com/containerd/log"
"github.com/containerd/platforms"
imageadapter "github.com/docker/docker/daemon/internal/builder-next/adapters/containerimage"
mobyexporter "github.com/docker/docker/daemon/internal/builder-next/exporter"
"github.com/docker/docker/daemon/internal/builder-next/worker/mod"
distmetadata "github.com/docker/docker/distribution/metadata"
"github.com/docker/docker/distribution/xfer"
"github.com/docker/docker/layer"
pkgprogress "github.com/docker/docker/pkg/progress"
"github.com/moby/buildkit/cache"
cacheconfig "github.com/moby/buildkit/cache/config"
"github.com/moby/buildkit/client"
"github.com/moby/buildkit/client/llb/sourceresolver"
"github.com/moby/buildkit/executor"
"github.com/moby/buildkit/exporter"
localexporter "github.com/moby/buildkit/exporter/local"
tarexporter "github.com/moby/buildkit/exporter/tar"
"github.com/moby/buildkit/frontend"
"github.com/moby/buildkit/session"
"github.com/moby/buildkit/snapshot"
containerdsnapshot "github.com/moby/buildkit/snapshot/containerd"
"github.com/moby/buildkit/solver"
"github.com/moby/buildkit/solver/llbsolver/cdidevices"
"github.com/moby/buildkit/solver/llbsolver/mounts"
"github.com/moby/buildkit/solver/llbsolver/ops"
"github.com/moby/buildkit/solver/pb"
"github.com/moby/buildkit/source"
"github.com/moby/buildkit/source/containerimage"
"github.com/moby/buildkit/source/git"
"github.com/moby/buildkit/source/http"
"github.com/moby/buildkit/source/local"
"github.com/moby/buildkit/util/archutil"
"github.com/moby/buildkit/util/contentutil"
"github.com/moby/buildkit/util/leaseutil"
"github.com/moby/buildkit/util/progress"
"github.com/moby/buildkit/version"
"github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
"golang.org/x/sync/semaphore"
)
// init pins the advertised BuildKit version to the version of the vendored
// github.com/moby/buildkit module, when resolvable from build info.
func init() {
	if v := mod.Version("github.com/moby/buildkit"); v != "" {
		version.Version = v
	}
}
// labelCreatedAt is the descriptor annotation key carrying a layer's creation
// time (text-marshaled time.Time).
const labelCreatedAt = "buildkit/createdat"

// LayerAccess provides access to a moby layer from a snapshot
type LayerAccess interface {
	// GetDiffIDs returns the DiffIDs for an already-materialized layer.
	GetDiffIDs(ctx context.Context, key string) ([]layer.DiffID, error)
	// EnsureLayer materializes the layer for key if needed and returns its DiffIDs.
	EnsureLayer(ctx context.Context, key string) ([]layer.DiffID, error)
}
// Opt defines a structure for creating a worker.
type Opt struct {
	ID                string
	Labels            map[string]string
	GCPolicy          []client.PruneInfo
	Executor          executor.Executor
	Snapshotter       snapshot.Snapshotter
	ContentStore      *containerdsnapshot.Store
	CacheManager      cache.Manager
	LeaseManager      *leaseutil.Manager
	GarbageCollect    func(context.Context) (gc.Stats, error) // optional; nil disables GC
	ImageSource       *imageadapter.Source
	DownloadManager   *xfer.LayerDownloadManager
	V2MetadataService distmetadata.V2MetadataService
	Transport         nethttp.RoundTripper // used by the http source
	Exporter          exporter.Exporter    // the moby image exporter
	Layers            LayerAccess
	Platforms         []ocispec.Platform
	CDIManager        *cdidevices.Manager
}
// Worker is a local worker instance with dedicated snapshotter, cache, and so on.
// TODO: s/Worker/OpWorker/g ?
type Worker struct {
	Opt
	SourceManager *source.Manager
}

// Compile-time assertion that Worker provides the GetRemotes method expected
// by the solver's cache-export path.
var _ interface {
	GetRemotes(context.Context, cache.ImmutableRef, bool, cacheconfig.RefConfig, bool, session.Group) ([]*solver.Remote, error)
} = &Worker{}
// NewWorker instantiates a local worker
func NewWorker(opt Opt) (*Worker, error) {
	sm, err := source.NewManager()
	if err != nil {
		return nil, err
	}
	cm := opt.CacheManager
	sm.Register(opt.ImageSource)

	// The git/http/local sources are best-effort: a failure to construct one
	// only disables that source, so log and continue.
	if gs, gerr := git.NewSource(git.Opt{CacheAccessor: cm}); gerr == nil {
		sm.Register(gs)
	} else {
		log.G(context.TODO()).Warnf("Could not register builder git source: %s", gerr)
	}
	if hs, herr := http.NewSource(http.Opt{CacheAccessor: cm, Transport: opt.Transport}); herr == nil {
		sm.Register(hs)
	} else {
		log.G(context.TODO()).Warnf("Could not register builder http source: %s", herr)
	}
	if ss, serr := local.NewSource(local.Opt{CacheAccessor: cm}); serr == nil {
		sm.Register(ss)
	} else {
		log.G(context.TODO()).Warnf("Could not register builder local source: %s", serr)
	}

	return &Worker{
		Opt:           opt,
		SourceManager: sm,
	}, nil
}
// ID returns worker ID
func (w *Worker) ID() string {
	return w.Opt.ID
}

// Labels returns map of all worker labels
func (w *Worker) Labels() map[string]string {
	return w.Opt.Labels
}
// Platforms returns one or more platforms supported by the image.
func (w *Worker) Platforms(noCache bool) []ocispec.Platform {
	if noCache {
		// Re-probe emulated platforms and merge them into the configured set.
		w.Opt.Platforms = mergePlatforms(w.Opt.Platforms, archutil.SupportedPlatforms(noCache))
	}
	if len(w.Opt.Platforms) > 0 {
		return w.Opt.Platforms
	}
	return []ocispec.Platform{platforms.DefaultSpec()}
}
// mergePlatforms merges the defined platforms with the supported platforms
// and returns a new slice of platforms. It ensures no duplicates.
func mergePlatforms(defined, supported []ocispec.Platform) []ocispec.Platform {
	merged := make([]ocispec.Platform, 0, len(defined)+len(supported))
	matchers := make([]platforms.MatchComparer, 0, len(defined))
	for _, p := range defined {
		merged = append(merged, p)
		matchers = append(matchers, platforms.Only(p))
	}
nextSupported:
	for _, p := range supported {
		// Skip any supported platform already covered by a defined one.
		for _, m := range matchers {
			if m.Match(p) {
				continue nextSupported
			}
		}
		merged = append(merged, p)
	}
	return merged
}
// GCPolicy returns automatic GC Policy
func (w *Worker) GCPolicy() []client.PruneInfo {
	return w.Opt.GCPolicy
}

// BuildkitVersion returns BuildKit version
func (w *Worker) BuildkitVersion() client.BuildkitVersion {
	return client.BuildkitVersion{
		Package:  version.Package,
		Version:  version.Version + "-moby", // suffix marks the moby-vendored build
		Revision: version.Revision,
	}
}
// GarbageCollect runs the configured garbage-collection hook, if any.
func (w *Worker) GarbageCollect(ctx context.Context) error {
	gcFn := w.Opt.GarbageCollect
	if gcFn == nil {
		return nil
	}
	_, err := gcFn(ctx)
	return err
}
// Close closes the worker and releases all resources
func (w *Worker) Close() error {
	return nil
}

// ContentStore returns the wrapped content store
func (w *Worker) ContentStore() *containerdsnapshot.Store {
	return w.Opt.ContentStore
}

// LeaseManager returns the wrapped lease manager
func (w *Worker) LeaseManager() *leaseutil.Manager {
	return w.Opt.LeaseManager
}
// LoadRef loads a reference by ID
func (w *Worker) LoadRef(ctx context.Context, id string, hidden bool) (cache.ImmutableRef, error) {
	if id == "" {
		// results can have nil refs if they are optimized out to be equal to scratch,
		// i.e. Diff(A,A) == scratch
		return nil, nil
	}
	var opts []cache.RefOption
	if hidden {
		opts = append(opts, cache.NoUpdateLastUsed)
	}
	return w.CacheManager().Get(ctx, id, nil, opts...)
}
// ResolveSourceMetadata resolves metadata for a source op. Image sources get
// their config resolved; all other identifiers return just the op.
func (w *Worker) ResolveSourceMetadata(ctx context.Context, op *pb.SourceOp, opt sourceresolver.Opt, sm *session.Manager, g session.Group) (*sourceresolver.MetaResponse, error) {
	if opt.SourcePolicies != nil {
		return nil, errors.New("source policies can not be set for worker")
	}

	var platform *pb.Platform
	if p := opt.Platform; p != nil {
		platform = &pb.Platform{
			Architecture: p.Architecture,
			OS:           p.OS,
			Variant:      p.Variant,
			OSVersion:    p.OSVersion,
		}
	}

	id, err := w.SourceManager.Identifier(&pb.Op_Source{Source: op}, platform)
	if err != nil {
		return nil, err
	}

	imgID, ok := id.(*containerimage.ImageIdentifier)
	if !ok {
		// Non-image sources carry no extra metadata.
		return &sourceresolver.MetaResponse{Op: op}, nil
	}
	if opt.ImageOpt == nil {
		opt.ImageOpt = &sourceresolver.ResolveImageOpt{}
	}
	dgst, config, err := w.ImageSource.ResolveImageConfig(ctx, imgID.Reference.String(), opt, sm, g)
	if err != nil {
		return nil, err
	}
	return &sourceresolver.MetaResponse{
		Op: op,
		Image: &sourceresolver.ResolveImageResponse{
			Digest: dgst,
			Config: config,
		},
	}, nil
}
// ResolveOp converts a LLB vertex into a LLB operation
func (w *Worker) ResolveOp(v solver.Vertex, s frontend.FrontendLLBBridge, sm *session.Manager) (solver.Op, error) {
	if baseOp, ok := v.Sys().(*pb.Op); ok {
		// TODO do we need to pass a value here? Where should it come from? https://github.com/moby/buildkit/commit/b3cf7c43cfefdfd7a945002c0e76b54e346ab6cf
		var parallelism *semaphore.Weighted
		// Dispatch on the concrete op type; each constructor receives only the
		// dependencies it needs.
		switch op := baseOp.Op.(type) {
		case *pb.Op_Source:
			return ops.NewSourceOp(v, op, baseOp.Platform, w.SourceManager, parallelism, sm, w)
		case *pb.Op_Exec:
			return ops.NewExecOp(v, op, baseOp.Platform, w.CacheManager(), parallelism, sm, w.Executor(), w)
		case *pb.Op_File:
			return ops.NewFileOp(v, op, w.CacheManager(), parallelism, w)
		case *pb.Op_Build:
			return ops.NewBuildOp(v, op, s, w)
		case *pb.Op_Merge:
			return ops.NewMergeOp(v, op, w)
		case *pb.Op_Diff:
			return ops.NewDiffOp(v, op, w)
		}
	}
	return nil, errors.Errorf("could not resolve %v", v)
}
// ResolveImageConfig returns image config for an image
func (w *Worker) ResolveImageConfig(ctx context.Context, ref string, opt sourceresolver.Opt, sm *session.Manager, g session.Group) (digest.Digest, []byte, error) {
	return w.ImageSource.ResolveImageConfig(ctx, ref, opt, sm, g)
}

// DiskUsage returns disk usage report
func (w *Worker) DiskUsage(ctx context.Context, opt client.DiskUsageInfo) ([]*client.UsageInfo, error) {
	return w.CacheManager().DiskUsage(ctx, opt)
}

// Prune deletes reclaimable build cache
func (w *Worker) Prune(ctx context.Context, ch chan client.UsageInfo, info ...client.PruneInfo) error {
	return w.CacheManager().Prune(ctx, ch, info...)
}
// Exporter returns exporter by name
func (w *Worker) Exporter(name string, sm *session.Manager) (exporter.Exporter, error) {
	switch name {
	case mobyexporter.Moby:
		return w.Opt.Exporter, nil
	case client.ExporterLocal:
		opt := localexporter.Opt{SessionManager: sm}
		return localexporter.New(opt)
	case client.ExporterTar:
		opt := tarexporter.Opt{SessionManager: sm}
		return tarexporter.New(opt)
	}
	return nil, errors.Errorf("exporter %q could not be found", name)
}
// GetRemotes returns the remote snapshot references given a local reference
func (w *Worker) GetRemotes(ctx context.Context, ref cache.ImmutableRef, createIfNeeded bool, _ cacheconfig.RefConfig, all bool, s session.Group) ([]*solver.Remote, error) {
	if ref == nil {
		return nil, nil
	}
	var diffIDs []layer.DiffID
	if createIfNeeded {
		// Materialize the snapshot before asking the layer store for it.
		if err := ref.Finalize(ctx); err != nil {
			return nil, err
		}
		if err := ref.Extract(ctx, s); err != nil {
			return nil, err
		}
		var err error
		if diffIDs, err = w.Layers.EnsureLayer(ctx, ref.ID()); err != nil {
			return nil, err
		}
	} else {
		var err error
		if diffIDs, err = w.Layers.GetDiffIDs(ctx, ref.ID()); err != nil {
			return nil, err
		}
	}
	descriptors := make([]ocispec.Descriptor, 0, len(diffIDs))
	for _, dgst := range diffIDs {
		descriptors = append(descriptors, ocispec.Descriptor{
			MediaType: c8dimages.MediaTypeDockerSchema2Layer,
			Digest:    dgst,
			Size:      -1, // size is unknown here; filled in by consumers
		})
	}
	return []*solver.Remote{{
		Descriptors: descriptors,
		Provider:    &emptyProvider{},
	}}, nil
}
// PruneCacheMounts removes the current cache snapshots for specified IDs
func (w *Worker) PruneCacheMounts(ctx context.Context, ids map[string]bool) error {
	// Hold the global cache-mounts lock for the whole operation so no new
	// mounts are created while existing ones are being invalidated.
	mu := mounts.CacheMountsLocker()
	mu.Lock()
	defer mu.Unlock()
	for id, nested := range ids {
		mds, err := mounts.SearchCacheDir(ctx, w.CacheManager(), id, nested)
		if err != nil {
			return err
		}
		for _, md := range mds {
			if err := md.SetCachePolicyDefault(); err != nil {
				return err
			}
			if err := md.ClearCacheDirIndex(); err != nil {
				return err
			}
			// if ref is unused try to clean it up right away by releasing it
			if mref, err := w.CacheManager().GetMutable(ctx, md.ID()); err == nil {
				go mref.Release(context.TODO())
			}
		}
	}
	mounts.ClearActiveCacheMounts()
	return nil
}
// getRef returns an immutable cache ref for the blob chain identified by
// diffIDs, recursively obtaining the parent chain first.
func (w *Worker) getRef(ctx context.Context, diffIDs []layer.DiffID, opts ...cache.RefOption) (cache.ImmutableRef, error) {
	var parent cache.ImmutableRef
	if len(diffIDs) > 1 {
		var err error
		parent, err = w.getRef(ctx, diffIDs[:len(diffIDs)-1], opts...)
		if err != nil {
			return nil, err
		}
		// NOTE(review): presumably GetByBlob retains its own reference to the
		// parent, so our hold can be released once this call returns — confirm
		// against cache.Manager semantics.
		defer parent.Release(context.TODO())
	}
	return w.CacheManager().GetByBlob(context.TODO(), ocispec.Descriptor{
		Annotations: map[string]string{
			"containerd.io/uncompressed": diffIDs[len(diffIDs)-1].String(),
		},
	}, parent, opts...)
}
// FromRemote converts a remote snapshot reference to a local one
func (w *Worker) FromRemote(ctx context.Context, remote *solver.Remote) (cache.ImmutableRef, error) {
	rootfs, err := getLayers(ctx, remote.Descriptors)
	if err != nil {
		return nil, err
	}

	// Build download descriptors for every layer in the remote.
	layers := make([]xfer.DownloadDescriptor, 0, len(rootfs))
	for _, l := range rootfs {
		// ongoing.add(desc)
		layers = append(layers, &layerDescriptor{
			desc:     l.Blob,
			diffID:   l.Diff.Digest,
			provider: remote.Provider,
			w:        w,
			pctx:     ctx,
		})
	}

	// Blobs copied into the content store during download are only needed
	// transiently; delete them once the layers are registered.
	defer func() {
		for _, l := range rootfs {
			w.ContentStore().Delete(context.TODO(), l.Blob.Digest)
		}
	}()

	rootFS, release, err := w.DownloadManager.Download(ctx, layers, &discardProgress{})
	if err != nil {
		return nil, err
	}
	defer release()

	if len(rootFS.DiffIDs) != len(layers) {
		return nil, errors.Errorf("invalid layer count mismatch %d vs %d", len(rootFS.DiffIDs), len(layers))
	}

	// Build the cache-ref chain bottom-up. Intermediate refs are released via
	// defer (intentionally deferred to function exit so the whole chain stays
	// alive until the final ref has been created); the topmost ref is returned
	// to the caller, who owns it.
	for i := range rootFS.DiffIDs {
		tm := time.Now()
		if tmstr, ok := remote.Descriptors[i].Annotations[labelCreatedAt]; ok {
			if err := (&tm).UnmarshalText([]byte(tmstr)); err != nil {
				return nil, err
			}
		}
		descr := fmt.Sprintf("imported %s", remote.Descriptors[i].Digest)
		if v, ok := remote.Descriptors[i].Annotations["buildkit/description"]; ok {
			descr = v
		}
		ref, err := w.getRef(ctx, rootFS.DiffIDs[:i+1], cache.WithDescription(descr), cache.WithCreationTime(tm))
		if err != nil {
			return nil, err
		}
		if i == len(remote.Descriptors)-1 {
			return ref, nil
		}
		defer ref.Release(context.TODO())
	}
	return nil, errors.Errorf("unreachable")
}
// Executor returns executor.Executor for running processes
func (w *Worker) Executor() executor.Executor {
	return w.Opt.Executor
}

// CacheManager returns cache.Manager for accessing local storage
func (w *Worker) CacheManager() cache.Manager {
	return w.Opt.CacheManager
}

// CDIManager returns the CDI device manager configured for this worker.
func (w *Worker) CDIManager() *cdidevices.Manager {
	return w.Opt.CDIManager
}

// discardProgress is a progress sink that drops all updates.
type discardProgress struct{}

// WriteProgress implements pkgprogress.Output by discarding the update.
func (*discardProgress) WriteProgress(_ pkgprogress.Progress) error {
	return nil
}
// Fetch(ctx context.Context, desc ocispec.Descriptor) (io.ReadCloser, error)
// layerDescriptor adapts a remote blob to the xfer.DownloadDescriptor
// interface used by the layer download manager.
type layerDescriptor struct {
	provider content.Provider   // source of the blob bytes
	desc     ocispec.Descriptor // the blob being downloaded
	diffID   layer.DiffID       // uncompressed digest of the layer
	// ref      ctdreference.Spec
	w    *Worker
	pctx context.Context // progress context, outlives individual download calls
}
// Key returns the distribution cache key for this layer blob.
func (ld *layerDescriptor) Key() string {
	return "v2:" + ld.desc.Digest.String()
}

// ID returns a unique identifier for the download (the blob digest).
func (ld *layerDescriptor) ID() string {
	return ld.desc.Digest.String()
}

// DiffID returns the known uncompressed digest of the layer.
func (ld *layerDescriptor) DiffID() (layer.DiffID, error) {
	return ld.diffID, nil
}
// Download copies the blob from its provider into the worker's content store
// and returns a reader over the stored bytes along with the blob size.
func (ld *layerDescriptor) Download(ctx context.Context, progressOutput pkgprogress.Output) (io.ReadCloser, int64, error) {
	done := oneOffProgress(ld.pctx, fmt.Sprintf("pulling %s", ld.desc.Digest))
	// TODO should this write output to progressOutput? Or use something similar to loggerFromContext()? see https://github.com/moby/buildkit/commit/aa29e7729464f3c2a773e27795e584023c751cb8
	if err := contentutil.Copy(ctx, ld.w.ContentStore(), ld.provider, ld.desc, "", func(_ []byte) {}); err != nil {
		return nil, 0, done(err)
	}
	_ = done(nil)
	ra, err := ld.w.ContentStore().ReaderAt(ctx, ld.desc)
	if err != nil {
		return nil, 0, err
	}
	return io.NopCloser(content.NewReader(ra)), ld.desc.Size, nil
}
// Close is a no-op; blob cleanup happens in FromRemote's deferred deletes.
func (ld *layerDescriptor) Close() {
	// ld.is.ContentStore().Delete(context.TODO(), ld.desc.Digest)
}

// Registered is called once the layer is stored in the layer store.
func (ld *layerDescriptor) Registered(diffID layer.DiffID) {
	// Cache mapping from this layer's DiffID to the blobsum
	ld.w.V2MetadataService.Add(diffID, distmetadata.V2Metadata{Digest: ld.desc.Digest})
}
// getLayers converts blob descriptors into rootfs.Layer pairs, requiring each
// descriptor to carry the "containerd.io/uncompressed" diffID annotation.
func getLayers(ctx context.Context, descs []ocispec.Descriptor) ([]rootfs.Layer, error) {
	out := make([]rootfs.Layer, len(descs))
	for i, desc := range descs {
		uncompressed := desc.Annotations["containerd.io/uncompressed"]
		if uncompressed == "" {
			return nil, errors.Errorf("%s missing uncompressed digest", desc.Digest)
		}
		diffID, err := digest.Parse(uncompressed)
		if err != nil {
			return nil, err
		}
		out[i] = rootfs.Layer{
			Diff: ocispec.Descriptor{
				MediaType: ocispec.MediaTypeImageLayer,
				Digest:    diffID,
			},
			Blob: ocispec.Descriptor{
				MediaType: desc.MediaType,
				Digest:    desc.Digest,
				Size:      desc.Size,
			},
		}
	}
	return out, nil
}
// oneOffProgress emits a "started" progress record for id and returns a
// completion callback that records the end time and passes err through.
func oneOffProgress(ctx context.Context, id string) func(err error) error {
	pw, _, _ := progress.NewFromContext(ctx)
	started := time.Now()
	st := progress.Status{Started: &started}
	_ = pw.Write(id, st)
	return func(err error) error {
		// TODO: set error on status
		completed := time.Now()
		st.Completed = &completed
		_ = pw.Write(id, st)
		_ = pw.Close()
		return err
	}
}
// emptyProvider is a content.Provider that serves no content; it backs
// remotes whose layer bytes are not available locally.
type emptyProvider struct{}

// ReaderAt always fails: there is no content to read.
func (p *emptyProvider) ReaderAt(ctx context.Context, dec ocispec.Descriptor) (content.ReaderAt, error) {
	return nil, errors.Errorf("ReaderAt not implemented for empty provider")
}

// Info always fails with ErrNotImplemented.
func (p *emptyProvider) Info(ctx context.Context, d digest.Digest) (content.Info, error) {
	return content.Info{}, errors.Wrapf(cerrdefs.ErrNotImplemented, "Info not implemented for empty provider")
}
package types
import (
"time"
"github.com/opencontainers/runtime-spec/specs-go"
)
// Summary is not used on linux
type Summary struct{}
// Stats holds metrics properties as returned by containerd
type Stats struct {
Read time.Time
// Metrics is expected to be either one of:
// * github.com/containerd/cgroups/v3/cgroup1/stats.Metrics
// * github.com/containerd/cgroups/v3/cgroup2/stats.Metrics
Metrics interface{}
}
// InterfaceToStats returns a stats object from the platform-specific interface.
func InterfaceToStats(read time.Time, v interface{}) *Stats {
return &Stats{
Metrics: v,
Read: read,
}
}
// Resources defines updatable container resource values. TODO: it must match containerd upcoming API
type Resources = specs.LinuxResources

// Checkpoints contains the details of a checkpoint
type Checkpoints struct{}
package metrics
import (
"sync"
gometrics "github.com/docker/go-metrics"
"github.com/prometheus/client_golang/prometheus"
)
var (
	// metricsNS is the root namespace; all metrics below are exported
	// under the "engine_daemon_" prefix.
	metricsNS = gometrics.NewNamespace("engine", "daemon", nil)

	// ContainerActions tracks the time taken to process container operations
	ContainerActions = metricsNS.NewLabeledTimer("container_actions", "The number of seconds it takes to process each container action", "action")
	// NetworkActions tracks the time taken to process network operations
	NetworkActions = metricsNS.NewLabeledTimer("network_actions", "The number of seconds it takes to process each network action", "action")
	// HostInfoFunctions tracks the time taken to gather host information
	HostInfoFunctions = metricsNS.NewLabeledTimer("host_info_functions", "The number of seconds it takes to call functions gathering info about the host", "function")
	// ImageActions tracks the time taken to process image operations
	ImageActions = metricsNS.NewLabeledTimer("image_actions", "The number of seconds it takes to process each image action", "action")

	// EngineInfo provides information about the engine and its environment
	EngineInfo = metricsNS.NewLabeledGauge("engine", "The information related to the engine and the OS it is running on", gometrics.Unit("info"),
		"version",
		"commit",
		"architecture",
		"graphdriver",
		"kernel",
		"os",
		"os_type",
		"os_version",
		"daemon_id",
	)
	// EngineCPUs tracks the number of CPUs available to the engine
	EngineCPUs = metricsNS.NewGauge("engine_cpus", "The number of cpus that the host system of the engine has", gometrics.Unit("cpus"))
	// EngineMemory tracks the amount of memory available to the engine
	EngineMemory = metricsNS.NewGauge("engine_memory", "The number of bytes of memory that the host system of the engine has", gometrics.Bytes)

	// HealthChecksCounter tracks the total number of health checks
	HealthChecksCounter = metricsNS.NewCounter("health_checks", "The total number of health checks")
	// HealthChecksFailedCounter tracks the number of failed health checks
	HealthChecksFailedCounter = metricsNS.NewCounter("health_checks_failed", "The total number of failed health checks")
	// HealthCheckStartDuration tracks the time taken to prepare health checks
	HealthCheckStartDuration = metricsNS.NewTimer("health_check_start_duration", "The number of seconds it takes to prepare to run health checks")

	// StateCtr tracks container states
	StateCtr = newStateCounter(metricsNS, metricsNS.NewDesc("container_states", "The count of containers in various states", gometrics.Unit("containers"), "state"))

	// EventsCounter tracks the number of events logged
	EventsCounter = metricsNS.NewCounter("events", "The number of events logged")
	// EventSubscribers tracks the number of current subscribers to events
	EventSubscribers = metricsNS.NewGauge("events_subscribers", "The number of current subscribers to events", gometrics.Total)
)
// init pre-seeds the container-action timers so that every action label
// is present (with a zero observation) from startup, then registers the
// whole namespace with the metrics registry.
func init() {
	actions := []string{"start", "changes", "commit", "create", "delete"}
	for _, action := range actions {
		ContainerActions.WithValues(action).Update(0)
	}

	gometrics.Register(metricsNS)
}
// StartTimer begins an observation on t and returns the function to call
// when the timed operation completes. Thin wrapper over gometrics.StartTimer.
func StartTimer(t gometrics.Timer) func() {
	return gometrics.StartTimer(t)
}
// StateCounter tracks container states
type StateCounter struct {
	mu     sync.RWMutex
	states map[string]string // container ID -> state label (e.g. "running")
	desc   *prometheus.Desc  // shared descriptor; the "state" label varies per metric
}
// newStateCounter builds a StateCounter for desc and registers it as a
// collector with the given metrics namespace.
func newStateCounter(ns *gometrics.Namespace, desc *prometheus.Desc) *StateCounter {
	sc := &StateCounter{
		desc:   desc,
		states: map[string]string{},
	}
	ns.Add(sc)
	return sc
}
// Get returns the count of containers in running, paused, and stopped states
func (ctr *StateCounter) Get() (running int, paused int, stopped int) {
	ctr.mu.RLock()
	defer ctr.mu.RUnlock()

	for _, label := range ctr.states {
		switch label {
		case "running":
			running++
		case "paused":
			paused++
		case "stopped":
			stopped++
		}
	}
	return running, paused, stopped
}
// Set updates the state for a container
func (ctr *StateCounter) Set(id, label string) {
	ctr.mu.Lock()
	ctr.states[id] = label
	ctr.mu.Unlock()
}

// Delete removes a container's state
func (ctr *StateCounter) Delete(id string) {
	ctr.mu.Lock()
	delete(ctr.states, id)
	ctr.mu.Unlock()
}
// Describe implements prometheus.Collector by sending the single shared
// descriptor used for all per-state metrics.
func (ctr *StateCounter) Describe(ch chan<- *prometheus.Desc) {
	ch <- ctr.desc
}
// Collect implements prometheus.Collector
func (ctr *StateCounter) Collect(ch chan<- prometheus.Metric) {
	running, paused, stopped := ctr.Get()
	for _, g := range []struct {
		label string
		count int
	}{
		{"running", running},
		{"paused", paused},
		{"stopped", stopped},
	} {
		ch <- prometheus.MustNewConstMetric(ctr.desc, prometheus.GaugeValue, float64(g.count), g.label)
	}
}
//go:build !windows
package metrics
import (
"context"
"net"
"net/http"
"os"
"strings"
"sync"
"time"
"github.com/containerd/log"
"github.com/docker/docker/daemon/pkg/plugin"
"github.com/docker/docker/pkg/plugingetter"
"github.com/docker/docker/pkg/plugins"
gometrics "github.com/docker/go-metrics"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
)
// pluginType is the capability name metrics collector plugins register under.
const pluginType = "MetricsCollector"

// Plugin represents a metrics collector plugin
type Plugin interface {
	// StartMetrics tells the plugin to begin collecting metrics.
	StartMetrics() error
	// StopMetrics tells the plugin to stop collecting metrics.
	StopMetrics() error
}
// metricsPluginAdapter adapts a raw plugin client to the Plugin interface
// by issuing the MetricsCollector RPC calls.
type metricsPluginAdapter struct {
	client *plugins.Client
}

// StartMetrics asks the plugin to begin metrics collection.
func (a *metricsPluginAdapter) StartMetrics() error {
	return a.client.Call("/MetricsCollector.StartMetrics", nil, nil)
}

// StopMetrics asks the plugin to stop metrics collection.
func (a *metricsPluginAdapter) StopMetrics() error {
	return a.client.Call("/MetricsCollector.StopMetrics", nil, nil)
}

// makePluginAdapter wraps p's client in a Plugin implementation.
func makePluginAdapter(p plugingetter.CompatPlugin) (Plugin, error) {
	return &metricsPluginAdapter{client: p.Client()}, nil
}
// RegisterPlugin starts the metrics server listener and registers the metrics plugin
// callback with the plugin store
func RegisterPlugin(store *plugin.Store, path string) error {
	if err := listen(path); err != nil {
		return err
	}

	// Bind-mount the metrics socket into plugin containers (read-only) so
	// collector plugins can reach the daemon's metrics endpoint.
	store.RegisterRuntimeOpt(pluginType, func(s *specs.Spec) {
		f := plugin.WithSpecMounts([]specs.Mount{
			{Type: "bind", Source: path, Destination: "/run/docker/metrics.sock", Options: []string{"bind", "ro"}},
		})
		f(s)
	})
	store.Handle(pluginType, func(name string, client *plugins.Client) {
		// Use lookup since nothing in the system can really reference it, no need
		// to protect against removal
		p, err := store.Get(name, pluginType, plugingetter.Lookup)
		if err != nil {
			return
		}

		adapter, err := makePluginAdapter(p)
		if err != nil {
			log.G(context.TODO()).WithError(err).WithField("plugin", p.Name()).Error("Error creating plugin adapter")
			// Must bail out here: calling StartMetrics on a nil adapter
			// would panic.
			return
		}
		if err := adapter.StartMetrics(); err != nil {
			log.G(context.TODO()).WithError(err).WithField("plugin", p.Name()).Error("Error starting metrics collector plugin")
		}
	})
	return nil
}
// CleanupPlugin stops metrics collection for all plugins
func CleanupPlugin(store plugingetter.PluginGetter) {
	ls := store.GetAllManagedPluginsByCap(pluginType)

	var wg sync.WaitGroup
	for _, p := range ls {
		wg.Add(1)
		// Stop each plugin concurrently; pass the plugin explicitly so the
		// goroutine captures its own copy.
		go func(p plugingetter.CompatPlugin) {
			defer wg.Done()

			adapter, err := makePluginAdapter(p)
			if err != nil {
				log.G(context.TODO()).WithError(err).WithField("plugin", p.Name()).Error("Error creating metrics plugin adapter")
				return
			}
			if err := adapter.StopMetrics(); err != nil {
				log.G(context.TODO()).WithError(err).WithField("plugin", p.Name()).Error("Error stopping plugin metrics collection")
			}
		}(p)
	}
	wg.Wait()

	if listener != nil {
		_ = listener.Close()
	}
}
// listener is the unix socket serving the metrics API; closed by CleanupPlugin.
var listener net.Listener

// listen starts serving the Prometheus metrics handler on a unix socket
// at path. The server runs in a background goroutine until the listener
// is closed.
func listen(path string) error {
	// Remove any stale socket left behind by a previous run.
	_ = os.Remove(path)

	l, err := net.Listen("unix", path)
	if err != nil {
		return errors.Wrap(err, "error setting up metrics plugin listener")
	}

	mux := http.NewServeMux()
	mux.Handle("/metrics", gometrics.Handler())
	srv := &http.Server{
		Handler:           mux,
		ReadHeaderTimeout: 5 * time.Minute, // "G112: Potential Slowloris Attack (gosec)"; not a real concern for our use, so setting a long timeout.
	}
	go func() {
		log.G(context.TODO()).Debugf("metrics API listening on %s", l.Addr())
		// Serve returns a "use of closed network connection" error on normal
		// shutdown; only report anything else.
		if err := srv.Serve(l); err != nil && !strings.Contains(err.Error(), "use of closed network connection") {
			log.G(context.TODO()).WithError(err).Error("error serving metrics API")
		}
	}()
	listener = l
	return nil
}
package restartmanager
import (
"errors"
"sync"
"time"
"github.com/moby/moby/api/types/container"
)
const (
	// backoffMultiplier doubles the restart delay after each attempt.
	backoffMultiplier = 2
	// defaultTimeout is the initial delay before the first restart.
	defaultTimeout = 100 * time.Millisecond
	// maxRestartTimeout caps the exponential backoff.
	maxRestartTimeout = 1 * time.Minute
)

// ErrRestartCanceled is returned when the restart manager has been
// canceled and will no longer restart the container.
var ErrRestartCanceled = errors.New("restart canceled")
// RestartManager defines object that controls container restarting rules.
type RestartManager struct {
	sync.Mutex
	sync.Once                              // guards Cancel so the cancel channel is closed at most once
	policy       container.RestartPolicy   // the restart policy in effect
	restartCount int                       // number of restarts performed so far
	timeout      time.Duration             // current backoff delay; grows up to maxRestartTimeout
	active       bool                      // true while a restart is pending
	cancel       chan struct{}             // closed by Cancel to abort a pending restart
	canceled     bool                      // set once Cancel has been called
}
// New returns a new RestartManager based on a policy.
func New(policy container.RestartPolicy, restartCount int) *RestartManager {
	return &RestartManager{
		policy:       policy,
		restartCount: restartCount,
		cancel:       make(chan struct{}),
	}
}
// SetPolicy sets the restart-policy for the RestartManager.
func (rm *RestartManager) SetPolicy(policy container.RestartPolicy) {
	rm.Lock()
	defer rm.Unlock()
	rm.policy = policy
}
// ShouldRestart returns whether the container should be restarted.
// On a true result it also returns a channel that receives
// ErrRestartCanceled if the restart is canceled while pending, and is
// closed once the backoff delay has elapsed.
func (rm *RestartManager) ShouldRestart(exitCode uint32, hasBeenManuallyStopped bool, executionDuration time.Duration) (bool, chan error, error) {
	// NOTE(review): rm.policy is read here before taking the lock while
	// SetPolicy writes it under the lock — confirm callers serialize these.
	if rm.policy.IsNone() {
		return false, nil, nil
	}
	rm.Lock()
	// unlockOnExit lets the success path hand the unlock off to an explicit
	// rm.Unlock() before spawning the timer goroutine.
	unlockOnExit := true
	defer func() {
		if unlockOnExit {
			rm.Unlock()
		}
	}()

	if rm.canceled {
		return false, nil, ErrRestartCanceled
	}

	if rm.active {
		return false, nil, errors.New("invalid call on an active restart manager")
	}
	// if the container ran for more than 10s, regardless of status and policy reset
	// the timeout back to the default.
	if executionDuration.Seconds() >= 10 {
		rm.timeout = 0
	}
	// Exponential backoff: start at defaultTimeout, double each time, and
	// clamp at maxRestartTimeout.
	switch {
	case rm.timeout == 0:
		rm.timeout = defaultTimeout
	case rm.timeout < maxRestartTimeout:
		rm.timeout *= backoffMultiplier
	}
	if rm.timeout > maxRestartTimeout {
		rm.timeout = maxRestartTimeout
	}

	var restart bool
	switch {
	case rm.policy.IsAlways():
		restart = true
	case rm.policy.IsUnlessStopped() && !hasBeenManuallyStopped:
		restart = true
	case rm.policy.IsOnFailure():
		// the default value of 0 for MaximumRetryCount means that we will not enforce a maximum count
		if maxRetryCount := rm.policy.MaximumRetryCount; maxRetryCount == 0 || rm.restartCount < maxRetryCount {
			restart = exitCode != 0
		}
	}

	if !restart {
		rm.active = false
		return false, nil, nil
	}

	rm.restartCount++

	unlockOnExit = false
	rm.active = true
	rm.Unlock()

	ch := make(chan error)
	// Signal the caller on ch when the backoff delay elapses, or deliver
	// ErrRestartCanceled if Cancel() closes rm.cancel first.
	go func() {
		timeout := time.NewTimer(rm.timeout)
		defer timeout.Stop()
		select {
		case <-rm.cancel:
			ch <- ErrRestartCanceled
			close(ch)
		case <-timeout.C:
			rm.Lock()
			close(ch)
			rm.active = false
			rm.Unlock()
		}
	}()

	return true, ch, nil
}
// Cancel tells the RestartManager to no longer restart the container.
// It is safe to call multiple times; only the first call has any effect.
func (rm *RestartManager) Cancel() {
	rm.Do(func() {
		rm.Lock()
		defer rm.Unlock()
		rm.canceled = true
		close(rm.cancel)
	})
}
package stream
import (
"context"
"io"
"github.com/containerd/log"
"github.com/docker/docker/pkg/pools"
"github.com/moby/term"
"github.com/pkg/errors"
"golang.org/x/sync/errgroup"
)
// defaultEscapeSequence is used to detach from a TTY stream when the
// client did not specify its own detach keys.
var defaultEscapeSequence = []byte{16, 17} // ctrl-p, ctrl-q

// AttachConfig is the config struct used to attach a client to a stream's stdio
type AttachConfig struct {
	// Tells the attach copier that the stream's stdin is a TTY and to look for
	// escape sequences in stdin to detach from the stream.
	// When true the escape sequence is not passed to the underlying stream
	TTY bool
	// Specifies the detach keys the client will be using
	// Only useful when `TTY` is true
	DetachKeys []byte

	// CloseStdin signals that once done, stdin for the attached stream should be closed
	// For example, this would close the attached container's stdin.
	CloseStdin bool

	// UseStd* indicate whether the client has requested to be connected to the
	// given stream or not. These flags are used instead of checking Std* != nil
	// at points before the client streams Std* are wired up.
	UseStdin, UseStdout, UseStderr bool

	// CStd* are the streams directly connected to the container
	CStdin           io.WriteCloser
	CStdout, CStderr io.ReadCloser

	// Provide client streams to wire up to
	Stdin          io.ReadCloser
	Stdout, Stderr io.Writer
}
// AttachStreams attaches the container's streams to the AttachConfig,
// wiring up only the streams the client requested via the UseStd* flags.
func (c *Config) AttachStreams(cfg *AttachConfig) {
	if cfg.UseStdout {
		cfg.CStdout = c.StdoutPipe()
	}
	if cfg.UseStderr {
		cfg.CStderr = c.StderrPipe()
	}
	if cfg.UseStdin {
		cfg.CStdin = c.StdinPipe()
	}
}
// CopyStreams starts goroutines to copy data in and out to/from the container.
// The returned channel yields the first copy error (or ctx.Err) once all
// copiers have finished.
func (c *Config) CopyStreams(ctx context.Context, cfg *AttachConfig) <-chan error {
	var group errgroup.Group

	// Connect stdin of container to the attach stdin stream.
	if cfg.Stdin != nil {
		group.Go(func() error {
			log.G(ctx).Debug("attach: stdin: begin")
			defer log.G(ctx).Debug("attach: stdin: end")

			defer func() {
				// When CloseStdin is requested (and not a TTY), propagate EOF
				// to the container's stdin; otherwise tear down the output
				// pipes so the stdout/stderr copiers unblock too.
				if cfg.CloseStdin && !cfg.TTY {
					cfg.CStdin.Close()
				} else {
					// No matter what, when stdin is closed (io.Copy unblock), close stdout and stderr
					if cfg.CStdout != nil {
						cfg.CStdout.Close()
					}
					if cfg.CStderr != nil {
						cfg.CStderr.Close()
					}
				}
			}()

			var err error
			if cfg.TTY {
				// TTY mode: scan for the detach escape sequence while copying.
				_, err = copyEscapable(cfg.CStdin, cfg.Stdin, cfg.DetachKeys)
			} else {
				_, err = pools.Copy(cfg.CStdin, cfg.Stdin)
			}
			// A closed pipe is the normal way these copies terminate; not an error.
			if errors.Is(err, io.ErrClosedPipe) {
				err = nil
			}
			if err != nil {
				log.G(ctx).WithError(err).Debug("error on attach stdin")
				return errors.Wrap(err, "error on attach stdin")
			}
			return nil
		})
	}

	// attachStream copies one container output pipe to the client stream,
	// closing client stdin and the pipe when the copy ends.
	attachStream := func(name string, stream io.Writer, streamPipe io.ReadCloser) error {
		log.G(ctx).Debugf("attach: %s: begin", name)
		defer log.G(ctx).Debugf("attach: %s: end", name)
		defer func() {
			// Make sure stdin gets closed
			if cfg.Stdin != nil {
				cfg.Stdin.Close()
			}
			streamPipe.Close()
		}()

		_, err := pools.Copy(stream, streamPipe)
		if errors.Is(err, io.ErrClosedPipe) {
			err = nil
		}
		if err != nil {
			log.G(ctx).WithError(err).Debugf("attach: %s", name)
			return errors.Wrapf(err, "error attaching %s stream", name)
		}
		return nil
	}

	if cfg.Stdout != nil {
		group.Go(func() error {
			return attachStream("stdout", cfg.Stdout, cfg.CStdout)
		})
	}
	if cfg.Stderr != nil {
		group.Go(func() error {
			return attachStream("stderr", cfg.Stderr, cfg.CStderr)
		})
	}

	errs := make(chan error, 1)
	go func() {
		defer log.G(ctx).Debug("attach done")
		groupErr := make(chan error, 1)
		go func() {
			groupErr <- group.Wait()
		}()
		select {
		case <-ctx.Done():
			// close all pipes
			if cfg.CStdin != nil {
				cfg.CStdin.Close()
			}
			if cfg.CStdout != nil {
				cfg.CStdout.Close()
			}
			if cfg.CStderr != nil {
				cfg.CStderr.Close()
			}

			if cfg.Stdin != nil {
				// In this case, `cfg.Stdin` is a stream from the client.
				// The way `io.Copy` works we may get stuck waiting to read from `cfg.Stdin` even if the container has exited.
				// This will cause the `io.Copy` to never return and the `group.Wait()` to never return.
				// By closing cfg.Stdin we will cause the `io.Copy` to return and the `group.Wait()` to return.
				cfg.Stdin.Close()
			}

			// Now with these closed, wait should return.
			if err := group.Wait(); err != nil {
				errs <- err
				return
			}
			errs <- ctx.Err()
		case err := <-groupErr:
			errs <- err
		}
	}()

	return errs
}
// copyEscapable copies src to dst while watching for the detach escape
// sequence (keys, or the default ctrl-p ctrl-q when empty). The source
// is always closed when the copy ends.
func copyEscapable(dst io.Writer, src io.ReadCloser, keys []byte) (written int64, _ error) {
	defer src.Close()
	if len(keys) == 0 {
		keys = defaultEscapeSequence
	}
	return pools.Copy(dst, term.NewEscapeProxy(src, keys))
}
package bytespipe
import (
"errors"
"io"
)
// errBufferFull is returned by fixedBuffer.Write when the buffer has no
// remaining capacity for any part of p.
var errBufferFull = errors.New("buffer is full")

// fixedBuffer is a fixed-capacity byte queue: writes append at pos and
// reads consume from lastRead. The backing array is never reallocated.
type fixedBuffer struct {
	buf      []byte
	pos      int
	lastRead int
}

// Write copies as much of p as fits into the remaining capacity. It
// returns errBufferFull when the buffer fills before all of p is
// written, io.ErrShortWrite for any other partial write.
func (b *fixedBuffer) Write(p []byte) (int, error) {
	n := copy(b.buf[b.pos:cap(b.buf)], p)
	b.pos += n
	if n == len(p) {
		return n, nil
	}
	if b.pos == cap(b.buf) {
		return n, errBufferFull
	}
	return n, io.ErrShortWrite
}

// Read copies buffered, unread bytes into p and advances the read cursor.
// It never returns an error.
func (b *fixedBuffer) Read(p []byte) (int, error) {
	n := copy(p, b.buf[b.lastRead:b.pos])
	b.lastRead += n
	return n, nil
}

// Len reports the number of bytes written but not yet read.
func (b *fixedBuffer) Len() int {
	return b.pos - b.lastRead
}

// Cap reports the fixed capacity of the backing array.
func (b *fixedBuffer) Cap() int {
	return cap(b.buf)
}

// Reset rewinds both cursors, keeping the backing array for reuse.
func (b *fixedBuffer) Reset() {
	*b = fixedBuffer{buf: b.buf[:0]}
}

// String returns the unread portion as a string.
func (b *fixedBuffer) String() string {
	return string(b.buf[b.lastRead:b.pos])
}
package bytespipe
import (
"errors"
"io"
"sync"
)
// maxCap is the highest capacity to use in byte slices that buffer data.
const maxCap = 1e6

// minCap is the lowest capacity to use in byte slices that buffer data
const minCap = 64

// blockThreshold is the minimum number of bytes in the buffer which will cause
// a write to BytesPipe to block when allocating a new slice.
const blockThreshold = 1e6

var (
	// ErrClosed is returned when Write is called on a closed BytesPipe.
	ErrClosed = errors.New("write to closed BytesPipe")

	// bufPools keeps one sync.Pool of *fixedBuffer per capacity so
	// buffers of the same size are reused across pipes.
	bufPools     = make(map[int]*sync.Pool)
	bufPoolsLock sync.Mutex
)
// BytesPipe is io.ReadWriteCloser which works similarly to pipe(queue).
// All written data may be read at most once. Also, BytesPipe allocates
// and releases new byte slices to adjust to current needs, so the buffer
// won't be overgrown after peak loads.
type BytesPipe struct {
	mu       sync.Mutex
	wait     *sync.Cond      // signaled on write, read, and close to wake blocked peers
	buf      []*fixedBuffer  // queue of buffers; reads drain from the front, writes fill the back
	bufLen   int             // total unread bytes across all buffers
	closeErr error           // error to return from next Read. set to nil if not closed.
	readBlock bool           // check read BytesPipe is Wait() or not
}
// New creates an empty BytesPipe, pre-seeded with a single buffer of
// capacity minCap (64 bytes).
func New() *BytesPipe {
	bp := &BytesPipe{
		buf: []*fixedBuffer{getBuffer(minCap)},
	}
	bp.wait = sync.NewCond(&bp.mu)
	return bp
}
// Write writes p to BytesPipe.
// It can allocate new []byte slices in a process of writing.
// Write blocks (on the condition variable) while blockThreshold or more
// bytes are buffered and unread; it returns ErrClosed once the pipe has
// been closed.
func (bp *BytesPipe) Write(p []byte) (int, error) {
	bp.mu.Lock()
	defer bp.mu.Unlock()

	written := 0
loop0:
	for {
		if bp.closeErr != nil {
			return written, ErrClosed
		}

		if len(bp.buf) == 0 {
			// Use the named minCap constant instead of a magic 64, matching New.
			bp.buf = append(bp.buf, getBuffer(minCap))
		}
		// get the last buffer
		b := bp.buf[len(bp.buf)-1]

		n, err := b.Write(p)
		written += n
		bp.bufLen += n

		// errBufferFull is an error we expect to get if the buffer is full
		if err != nil && !errors.Is(err, errBufferFull) {
			bp.wait.Broadcast()
			return written, err
		}

		// if there was enough room to write all then break
		if len(p) == n {
			break
		}

		// more data: write to the next slice
		p = p[n:]

		// make sure the buffer doesn't grow too big from this write
		for bp.bufLen >= blockThreshold {
			// Wake a blocked reader so it can drain before we allocate more.
			if bp.readBlock {
				bp.wait.Broadcast()
			}
			bp.wait.Wait()
			if bp.closeErr != nil {
				continue loop0
			}
		}

		// add new byte slice to the buffers slice and continue writing;
		// capacities double up to maxCap.
		nextCap := b.Cap() * 2
		if nextCap > maxCap {
			nextCap = maxCap
		}
		bp.buf = append(bp.buf, getBuffer(nextCap))
	}
	bp.wait.Broadcast()
	return written, nil
}
// CloseWithError marks the pipe as closed so that reads return err (or
// io.EOF when err is nil) once the buffered data is drained, and wakes
// any goroutines blocked on the pipe. It always returns nil.
func (bp *BytesPipe) CloseWithError(err error) error {
	bp.mu.Lock()
	defer bp.mu.Unlock()
	if err == nil {
		err = io.EOF
	}
	bp.closeErr = err
	bp.wait.Broadcast()
	return nil
}

// Close causes further reads from a BytesPipe to return immediately.
func (bp *BytesPipe) Close() error {
	return bp.CloseWithError(nil)
}
// Read reads bytes from BytesPipe.
// Data could be read only once.
// Read blocks on the condition variable while the pipe is empty and not
// yet closed; drained buffers are returned to the pool.
func (bp *BytesPipe) Read(p []byte) (int, error) {
	bp.mu.Lock()
	defer bp.mu.Unlock()
	if bp.bufLen == 0 {
		if bp.closeErr != nil {
			return 0, bp.closeErr
		}
		// readBlock tells writers a reader is parked, so they broadcast
		// before blocking themselves.
		bp.readBlock = true
		bp.wait.Wait()
		bp.readBlock = false
		// NOTE(review): only one Wait — if woken with bufLen still 0 and the
		// pipe not closed, this falls through and returns (0, nil); confirm
		// callers tolerate zero-byte reads.
		if bp.bufLen == 0 && bp.closeErr != nil {
			return 0, bp.closeErr
		}
	}

	var n int
	for bp.bufLen > 0 {
		b := bp.buf[0]
		read, _ := b.Read(p) // ignore error since fixedBuffer doesn't really return an error
		n += read
		bp.bufLen -= read
		if b.Len() == 0 {
			// it's empty so return it to the pool and move to the next one
			returnBuffer(b)
			bp.buf[0] = nil
			bp.buf = bp.buf[1:]
		}
		if len(p) == read {
			break
		}
		p = p[read:]
	}

	// Wake writers that may be blocked on blockThreshold.
	bp.wait.Broadcast()
	return n, nil
}
// returnBuffer resets b and puts it back into the pool matching its
// capacity, if such a pool exists.
func returnBuffer(b *fixedBuffer) {
	b.Reset()

	bufPoolsLock.Lock()
	pool, ok := bufPools[b.Cap()]
	bufPoolsLock.Unlock()

	if ok {
		pool.Put(b)
	}
}

// getBuffer fetches a fixedBuffer of the requested capacity from the
// per-size pool, lazily creating the pool on first use.
func getBuffer(size int) *fixedBuffer {
	bufPoolsLock.Lock()
	pool := bufPools[size]
	if pool == nil {
		pool = &sync.Pool{New: func() interface{} { return &fixedBuffer{buf: make([]byte, 0, size)} }}
		bufPools[size] = pool
	}
	bufPoolsLock.Unlock()

	return pool.Get().(*fixedBuffer)
}
package stream
import (
"context"
"errors"
"fmt"
"io"
"sync"
"sync/atomic"
"github.com/containerd/containerd/v2/pkg/cio"
"github.com/containerd/log"
"github.com/docker/docker/daemon/internal/stream/bytespipe"
"github.com/docker/docker/pkg/pools"
)
// Config holds information about I/O streams managed together.
//
// Config.StdinPipe returns a WriteCloser which can be used to feed data
// to the standard input of the Config's active process.
// Config.StdoutPipe and Config.StderrPipe each return a ReadCloser
// which can be used to retrieve the standard output (and error) generated
// by the container's active process. The output (and error) are actually
// copied and delivered to all StdoutPipe and StderrPipe consumers, using
// a kind of "broadcaster".
type Config struct {
	wg        sync.WaitGroup // tracks the stdout/stderr copiers started by CopyToPipe
	stdout    *unbuffered    // broadcaster for container stdout
	stderr    *unbuffered    // broadcaster for container stderr
	stdin     io.ReadCloser
	stdinPipe io.WriteCloser
	dio       *cio.DirectIO // containerd IO, canceled by Wait on timeout
	// closed is set to true when CloseStreams is called
	closed atomic.Bool
}
// NewConfig creates a stream config and initializes
// the standard err and standard out to new unbuffered broadcasters.
func NewConfig() *Config {
	c := &Config{}
	c.stdout = &unbuffered{}
	c.stderr = &unbuffered{}
	return c
}
// Stdout returns the standard output in the configuration.
func (c *Config) Stdout() io.Writer {
	return c.stdout
}

// Stderr returns the standard error in the configuration.
func (c *Config) Stderr() io.Writer {
	return c.stderr
}

// Stdin returns the standard input in the configuration.
// It is nil until NewInputPipes has been called.
func (c *Config) Stdin() io.ReadCloser {
	return c.stdin
}

// StdinPipe returns an input writer pipe as an io.WriteCloser.
// It is nil until NewInputPipes or NewNopInputPipe has been called.
func (c *Config) StdinPipe() io.WriteCloser {
	return c.stdinPipe
}
// StdoutPipe creates a new io.ReadCloser with an empty bytes pipe.
// It adds this new out pipe to the Stdout broadcaster.
// This will block stdout if unconsumed.
func (c *Config) StdoutPipe() io.ReadCloser {
	p := bytespipe.New()
	c.stdout.Add(p)
	return p
}

// StderrPipe creates a new io.ReadCloser with an empty bytes pipe.
// It adds this new err pipe to the Stderr broadcaster.
// This will block stderr if unconsumed.
func (c *Config) StderrPipe() io.ReadCloser {
	p := bytespipe.New()
	c.stderr.Add(p)
	return p
}
// NewInputPipes creates new pipes for both standard inputs, Stdin and StdinPipe.
func (c *Config) NewInputPipes() {
	c.stdin, c.stdinPipe = io.Pipe()
}

// NewNopInputPipe creates a new input pipe that will silently drop all messages in the input.
// Stdin remains nil in this mode; only StdinPipe is populated.
func (c *Config) NewNopInputPipe() {
	c.stdinPipe = &nopWriteCloser{io.Discard}
}

// nopWriteCloser adds a no-op Close to an io.Writer.
type nopWriteCloser struct {
	io.Writer
}

func (w *nopWriteCloser) Close() error { return nil }
// CloseStreams ensures that the configured streams are properly closed.
// It marks the config closed first so the copy goroutines suppress
// errors caused by the teardown, then closes stdin (if any) and cleans
// both broadcasters, returning all failures joined together.
func (c *Config) CloseStreams() error {
	c.closed.Store(true)

	var errs []error
	if c.stdin != nil {
		if err := c.stdin.Close(); err != nil {
			errs = append(errs, fmt.Errorf("error close stdin: %w", err))
		}
	}
	if err := c.stdout.Clean(); err != nil {
		errs = append(errs, fmt.Errorf("error close stdout: %w", err))
	}
	if err := c.stderr.Clean(); err != nil {
		errs = append(errs, fmt.Errorf("error close stderr: %w", err))
	}
	return errors.Join(errs...)
}
// CopyToPipe connects streamconfig with a libcontainerd.IOPipe
// by starting copy goroutines for stdout, stderr, and stdin.
func (c *Config) CopyToPipe(iop *cio.DirectIO) {
	ctx := context.TODO()

	c.dio = iop
	// copyFunc pumps one container output stream into its broadcaster.
	// Each copier is tracked in c.wg so Wait can block on completion.
	copyFunc := func(name string, w io.Writer, r io.ReadCloser) {
		c.wg.Add(1)
		go func() {
			defer c.wg.Done()
			if _, err := pools.Copy(w, r); err != nil {
				// Errors are expected (and suppressed) once CloseStreams ran.
				if c.closed.Load() {
					return
				}
				log.G(ctx).WithFields(log.Fields{"stream": name, "error": err}).Error("copy stream failed")
			}
			if err := r.Close(); err != nil && !c.closed.Load() {
				log.G(ctx).WithFields(log.Fields{"stream": name, "error": err}).Warn("close stream failed")
			}
		}()
	}

	if iop.Stdout != nil {
		copyFunc("stdout", c.Stdout(), iop.Stdout)
	}
	if iop.Stderr != nil {
		copyFunc("stderr", c.Stderr(), iop.Stderr)
	}

	if stdin := c.Stdin(); stdin != nil {
		if iop.Stdin != nil {
			// The stdin copier is intentionally NOT added to c.wg: it blocks
			// on the client's stdin, so Wait must not depend on it.
			go func() {
				_, err := pools.Copy(iop.Stdin, stdin)
				if err != nil {
					if c.closed.Load() {
						return
					}
					log.G(ctx).WithFields(log.Fields{"stream": "stdin", "error": err}).Error("copy stream failed")
				}
				if err := iop.Stdin.Close(); err != nil && !c.closed.Load() {
					log.G(ctx).WithFields(log.Fields{"stream": "stdin", "error": err}).Warn("close stream failed")
				}
			}()
		}
	}
}
// Wait blocks until the stdout/stderr stream copiers have finished.
// If ctx is canceled first, the underlying containerd IO is canceled,
// waited on, and closed to forcefully unblock the streams.
func (c *Config) Wait(ctx context.Context) {
	done := make(chan struct{}, 1)
	go func() {
		defer close(done)
		c.wg.Wait()
	}()
	select {
	case <-done:
	case <-ctx.Done():
		if dio := c.dio; dio != nil {
			dio.Cancel()
			dio.Wait()
			dio.Close()
		}
	}
}
package stream
import (
"io"
"sync"
)
// unbuffered accumulates multiple io.WriteCloser by stream.
type unbuffered struct {
mu sync.Mutex
writers []io.WriteCloser
}
// Add adds new io.WriteCloser.
func (w *unbuffered) Add(writer io.WriteCloser) {
w.mu.Lock()
w.writers = append(w.writers, writer)
w.mu.Unlock()
}
// Write writes bytes to all writers. Failed writers will be evicted during
// this call.
func (w *unbuffered) Write(p []byte) (int, error) {
w.mu.Lock()
var evict []int
for i, sw := range w.writers {
if n, err := sw.Write(p); err != nil || n != len(p) {
// On error, evict the writer
evict = append(evict, i)
}
}
for n, i := range evict {
w.writers = append(w.writers[:i-n], w.writers[i-n+1:]...)
}
w.mu.Unlock()
return len(p), nil
}
// Clean closes and removes all writers. Last non-eol-terminated part of data
// will be saved.
func (w *unbuffered) Clean() error {
w.mu.Lock()
for _, sw := range w.writers {
sw.Close()
}
w.writers = nil
w.mu.Unlock()
return nil
}
package libnetwork
//go:generate protoc -I=. -I=../../vendor/ --gogofaster_out=import_path=github.com/docker/docker/daemon/libnetwork:. agent.proto
import (
"context"
"encoding/json"
"errors"
"fmt"
"net"
"sort"
"sync"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/cluster"
"github.com/docker/docker/daemon/libnetwork/discoverapi"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/networkdb"
"github.com/docker/docker/daemon/libnetwork/scope"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/go-events"
"github.com/gogo/protobuf/proto"
)
const (
	// subsysGossip identifies NetworkDB gossip encryption keys.
	subsysGossip = "networking:gossip"
	// subsysIPSec identifies IPSec datapath encryption keys.
	subsysIPSec = "networking:ipsec"
	// keyringSize appears to be the expected number of keys kept per
	// subsystem — TODO confirm (previous/current/next rotation set).
	keyringSize = 3
)
// ByTime implements sort.Interface for []*types.EncryptionKey based on
// the LamportTime field.
type ByTime []*types.EncryptionKey

func (b ByTime) Len() int           { return len(b) }
func (b ByTime) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
func (b ByTime) Less(i, j int) bool { return b[i].LamportTime < b[j].LamportTime }
// nwAgent holds the gossip/cluster state for a libnetwork controller.
type nwAgent struct {
	networkDB         *networkdb.NetworkDB
	bindAddr          net.IP
	advertiseAddr     string
	dataPathAddr      string            // optional dedicated datapath address; falls back to advertiseAddr
	coreCancelFuncs   []func()
	driverCancelFuncs map[string][]func()
	mu                sync.Mutex // guards the fields above
}
// dataPathAddress returns the dedicated datapath address when one was
// configured, falling back to the advertise address otherwise.
func (a *nwAgent) dataPathAddress() string {
	a.mu.Lock()
	defer a.mu.Unlock()
	if addr := a.dataPathAddr; addr != "" {
		return addr
	}
	return a.advertiseAddr
}
const libnetworkEPTable = "endpoint_table"

// getBindAddr returns the first usable (non-link-local) IP address
// configured on the named network interface.
func getBindAddr(ifaceName string) (net.IP, error) {
	iface, err := net.InterfaceByName(ifaceName)
	if err != nil {
		// Wrap with %w so callers can inspect the cause with errors.Is/As.
		return nil, fmt.Errorf("failed to find interface %s: %w", ifaceName, err)
	}

	addrs, err := iface.Addrs()
	if err != nil {
		return nil, fmt.Errorf("failed to get interface addresses: %w", err)
	}

	for _, a := range addrs {
		addr, ok := a.(*net.IPNet)
		if !ok {
			continue
		}
		// Skip link-local addresses; they are not usable as a bind address.
		addrIP := addr.IP
		if addrIP.IsLinkLocalUnicast() {
			continue
		}
		return addrIP, nil
	}

	return nil, errors.New("failed to get bind address")
}

// resolveAddr resolves the given address, which can be one of, and
// parsed in the following order or priority:
//
// - a well-formed IP-address
// - a hostname
// - an interface-name
func resolveAddr(addrOrInterface string) (net.IP, error) {
	// Try and see if this is a valid IP address
	if ip := net.ParseIP(addrOrInterface); ip != nil {
		return ip, nil
	}

	// If not a valid IP address, it could be a hostname.
	addr, err := net.ResolveIPAddr("ip", addrOrInterface)
	if err != nil {
		// If hostname lookup failed, try to look for an interface with the given name.
		return getBindAddr(addrOrInterface)
	}
	return addr.IP, nil
}
// handleKeyChange reconciles the controller's keyring with the new set of
// encryption keys: removed keys are dropped from NetworkDB / pruned from
// the drivers, and added keys are installed. Keys are matched by
// LamportTime. Returns an error only if resolving a primary key fails.
func (c *Controller) handleKeyChange(keys []*types.EncryptionKey) error {
	drvEnc := discoverapi.DriverEncryptionUpdate{}

	agent := c.getAgent()
	if agent == nil {
		log.G(context.TODO()).Debug("Skipping key change as agent is nil")
		return nil
	}

	// Find the deleted key. If the deleted key was the primary key,
	// a new primary key should be set before removing it from keyring.
	c.mu.Lock()
	added := []byte{}
	deleted := []byte{}
	j := len(c.keys)
	// Swap-delete loop: keys absent from the new set are swapped to the
	// tail and the live range [0,j) shrinks.
	// NOTE(review): on a deletion the element swapped into position i is
	// never re-examined because i++ runs unconditionally — fine when at
	// most one key is deleted per update; confirm that invariant holds.
	for i := 0; i < j; {
		same := false
		for _, key := range keys {
			if same = key.LamportTime == c.keys[i].LamportTime; same {
				break
			}
		}
		if !same {
			cKey := c.keys[i]
			if cKey.Subsystem == subsysGossip {
				deleted = cKey.Key
			}
			if cKey.Subsystem == subsysIPSec {
				drvEnc.Prune = cKey.Key
				drvEnc.PruneTag = cKey.LamportTime
			}
			c.keys[i], c.keys[j-1] = c.keys[j-1], c.keys[i]
			c.keys[j-1] = nil
			j--
		}
		i++
	}
	c.keys = c.keys[:j]

	// Find the new key and add it to the key ring
	for _, key := range keys {
		same := false
		for _, cKey := range c.keys {
			if same = cKey.LamportTime == key.LamportTime; same {
				break
			}
		}
		if !same {
			c.keys = append(c.keys, key)
			if key.Subsystem == subsysGossip {
				added = key.Key
			}
			if key.Subsystem == subsysIPSec {
				drvEnc.Key = key.Key
				drvEnc.Tag = key.LamportTime
			}
		}
	}
	c.mu.Unlock()

	// Install the new gossip key before rotating the primary, and only
	// remove the deleted key after the primary has been switched.
	if len(added) > 0 {
		agent.networkDB.SetKey(added)
	}

	key, _, err := c.getPrimaryKeyTag(subsysGossip)
	if err != nil {
		return err
	}
	agent.networkDB.SetPrimaryKey(key)

	key, tag, err := c.getPrimaryKeyTag(subsysIPSec)
	if err != nil {
		return err
	}
	drvEnc.Primary = key
	drvEnc.PrimaryTag = tag

	if len(deleted) > 0 {
		agent.networkDB.RemoveKey(deleted)
	}

	// Push the key update to every driver that supports discovery.
	c.drvRegistry.WalkDrivers(func(name string, driver driverapi.Driver, capability driverapi.Capability) bool {
		dr, ok := driver.(discoverapi.Discover)
		if !ok {
			return false
		}
		if err := dr.DiscoverNew(discoverapi.EncryptionKeysUpdate, drvEnc); err != nil {
			log.G(context.TODO()).Warnf("Failed to update datapath keys in driver %s: %v", name, err)
			// Attempt to reconfigure keys in case of a update failure
			// which can arise due to a mismatch of keys
			// if worker nodes get temporarily disconnected
			log.G(context.TODO()).Warnf("Reconfiguring datapath keys for  %s", name)
			drvCfgEnc := discoverapi.DriverEncryptionConfig{}
			drvCfgEnc.Keys, drvCfgEnc.Tags = c.getKeys(subsysIPSec)
			err = dr.DiscoverNew(discoverapi.EncryptionKeysConfig, drvCfgEnc)
			if err != nil {
				log.G(context.TODO()).Warnf("Failed to reset datapath keys in driver %s: %v", name, err)
			}
		}
		return false
	})

	return nil
}
// agentSetup initializes the libnetwork agent from the cluster provider's
// addresses and joins the gossip cluster. It is a no-op when an agent
// already exists.
func (c *Controller) agentSetup(clusterProvider cluster.Provider) error {
	agent := c.getAgent()
	if agent != nil {
		// agent is already present, so there is no need to initialize it again.
		return nil
	}

	bindAddr := clusterProvider.GetLocalAddress()
	advAddr := clusterProvider.GetAdvertiseAddress()
	dataAddr := clusterProvider.GetDataPathAddress()
	remoteList := clusterProvider.GetRemoteAddressList()
	// Strip ports: only the host parts are used for the gossip join.
	remoteAddrList := make([]string, 0, len(remoteList))
	for _, remote := range remoteList {
		addr, _, _ := net.SplitHostPort(remote)
		remoteAddrList = append(remoteAddrList, addr)
	}

	listen := clusterProvider.GetListenAddress()
	listenAddr, _, _ := net.SplitHostPort(listen)

	log.G(context.TODO()).WithFields(log.Fields{
		"listen-addr":               listenAddr,
		"local-addr":                bindAddr,
		"advertise-addr":            advAddr,
		"data-path-addr":            dataAddr,
		"remote-addr-list":          remoteAddrList,
		"network-control-plane-mtu": c.Config().NetworkControlPlaneMTU,
	}).Info("Initializing Libnetwork Agent")
	// The agent is only initialized once an advertise address is known.
	if advAddr != "" {
		if err := c.agentInit(listenAddr, bindAddr, advAddr, dataAddr); err != nil {
			log.G(context.TODO()).WithError(err).Errorf("Error in agentInit")
			return err
		}
		// Notify globally-scoped drivers that support discovery.
		c.drvRegistry.WalkDrivers(func(name string, driver driverapi.Driver, capability driverapi.Capability) bool {
			if capability.ConnectivityScope == scope.Global {
				if d, ok := driver.(discoverapi.Discover); ok {
					c.agentDriverNotify(d)
				}
			}
			return false
		})
	}

	// Joining is best-effort; failures are retried in the background.
	if len(remoteAddrList) > 0 {
		if err := c.agentJoin(remoteAddrList); err != nil {
			log.G(context.TODO()).WithError(err).Error("Error in joining gossip cluster: join will be retried in background")
		}
	}

	return nil
}
// For a given subsystem getKeys sorts the keys by lamport time and returns
// slice of keys and lamport time which can used as a unique tag for the keys
func (c *Controller) getKeys(subsystem string) (keys [][]byte, tags []uint64) {
	c.mu.Lock()
	defer c.mu.Unlock()

	sort.Sort(ByTime(c.keys))

	keys = make([][]byte, 0, len(c.keys))
	tags = make([]uint64, 0, len(c.keys))
	// Keep only the keys belonging to the requested subsystem, preserving the
	// lamport-time order established by the sort above.
	for _, key := range c.keys {
		if key.Subsystem == subsystem {
			keys = append(keys, key.Key)
			tags = append(tags, key.LamportTime)
		}
	}

	if len(keys) > 1 {
		// Swap the first two entries so that the second-oldest key is listed
		// first; consumers appear to treat position 0 as the primary key.
		// TODO(thaJeztah): why are we swapping order here? This code was added in https://github.com/moby/libnetwork/commit/e83d68b7d1fd9c479120914024242238f791b4dc
		keys[0], keys[1] = keys[1], keys[0]
		tags[0], tags[1] = tags[1], tags[0]
	}

	return keys, tags
}
// getPrimaryKeyTag returns the primary key and its lamport-time tag for the
// given subsystem. Keys are sorted by lamport time, filtered by subsystem,
// and the second entry of that list is treated as the primary key; an error
// is returned when fewer than two keys exist for the subsystem.
func (c *Controller) getPrimaryKeyTag(subsystem string) (key []byte, lamportTime uint64, _ error) {
	c.mu.Lock()
	defer c.mu.Unlock()

	sort.Sort(ByTime(c.keys))

	var subsysKeys []*types.EncryptionKey
	for _, k := range c.keys {
		if k.Subsystem == subsystem {
			subsysKeys = append(subsysKeys, k)
		}
	}
	if len(subsysKeys) < 2 {
		return nil, 0, fmt.Errorf("no primary key found for %s subsystem: %d keys found on controller, expected at least 2", subsystem, len(subsysKeys))
	}
	return subsysKeys[1].Key, subsysKeys[1].LamportTime, nil
}
// agentInit creates the controller's nwAgent: it resolves the bind address,
// boots a NetworkDB instance seeded with the gossip keys, registers the
// diagnostic handlers, starts watches on the endpoint and node tables, pushes
// the IPSec datapath keys to all discovery-capable drivers, and finally joins
// every known network to the cluster.
func (c *Controller) agentInit(listenAddr, bindAddrOrInterface, advertiseAddr, dataPathAddr string) error {
	bindAddr, err := resolveAddr(bindAddrOrInterface)
	if err != nil {
		return err
	}

	keys, _ := c.getKeys(subsysGossip)

	netDBConf := networkdb.DefaultConfig()
	netDBConf.BindAddr = listenAddr
	netDBConf.AdvertiseAddr = advertiseAddr
	netDBConf.Keys = keys
	if c.Config().NetworkControlPlaneMTU != 0 {
		// Consider the MTU remove the IP hdr (IPv4 or IPv6) and the TCP/UDP hdr.
		// To be on the safe side let's cut 100 bytes
		netDBConf.PacketBufferSize = (c.Config().NetworkControlPlaneMTU - 100)
		log.G(context.TODO()).Debugf("Control plane MTU: %d will initialize NetworkDB with: %d",
			c.Config().NetworkControlPlaneMTU, netDBConf.PacketBufferSize)
	}
	nDB, err := networkdb.New(netDBConf)
	if err != nil {
		return err
	}

	// Register the diagnostic handlers
	nDB.RegisterDiagnosticHandlers(c.diagnosticServer)

	// Start the endpoint and node table watches before publishing the agent;
	// their cancel functions are retained so agentClose can stop them.
	var cancelList []func()
	ch, cancel := nDB.Watch(libnetworkEPTable, "")
	cancelList = append(cancelList, cancel)
	nodeCh, cancel := nDB.Watch(networkdb.NodeTable, "")
	cancelList = append(cancelList, cancel)

	// Publish the agent under the controller lock; the handler goroutines are
	// only started after the agent pointer is visible.
	c.mu.Lock()
	c.agent = &nwAgent{
		networkDB:         nDB,
		bindAddr:          bindAddr,
		advertiseAddr:     advertiseAddr,
		dataPathAddr:      dataPathAddr,
		coreCancelFuncs:   cancelList,
		driverCancelFuncs: make(map[string][]func()),
	}
	c.mu.Unlock()

	go c.handleTableEvents(ch, c.handleEpTableEvent)
	go c.handleTableEvents(nodeCh, c.handleNodeTableEvent)

	// Hand the IPSec datapath keys to every discovery-capable driver.
	keys, tags := c.getKeys(subsysIPSec)
	c.drvRegistry.WalkDrivers(func(name string, driver driverapi.Driver, capability driverapi.Capability) bool {
		if dr, ok := driver.(discoverapi.Discover); ok {
			if err := dr.DiscoverNew(discoverapi.EncryptionKeysConfig, discoverapi.DriverEncryptionConfig{
				Keys: keys,
				Tags: tags,
			}); err != nil {
				log.G(context.TODO()).Warnf("Failed to set datapath keys in driver %s: %v", name, err)
			}
		}
		return false
	})

	c.WalkNetworks(joinCluster)

	return nil
}
// agentJoin makes the local NetworkDB instance join the gossip cluster formed
// by the given remote addresses. It is a no-op when no agent is running.
func (c *Controller) agentJoin(remoteAddrList []string) error {
	a := c.getAgent()
	if a == nil {
		return nil
	}
	return a.networkDB.Join(remoteAddrList)
}
// agentDriverNotify informs a discovery-capable driver about the local node
// (self discovery) and hands it the current IPSec datapath keys. Failures are
// logged but not fatal.
func (c *Controller) agentDriverNotify(d discoverapi.Discover) {
	a := c.getAgent()
	if a == nil {
		return
	}

	nodeData := discoverapi.NodeDiscoveryData{
		Address:     a.dataPathAddress(),
		BindAddress: a.bindAddr.String(),
		Self:        true,
	}
	if err := d.DiscoverNew(discoverapi.NodeDiscovery, nodeData); err != nil {
		log.G(context.TODO()).Warnf("Failed the node discovery in driver: %v", err)
	}

	keys, tags := c.getKeys(subsysIPSec)
	encConfig := discoverapi.DriverEncryptionConfig{
		Keys: keys,
		Tags: tags,
	}
	if err := d.DiscoverNew(discoverapi.EncryptionKeysConfig, encConfig); err != nil {
		log.G(context.TODO()).Warnf("Failed to set datapath keys in driver: %v", err)
	}
}
// agentClose stops the cluster agent: it detaches the agent from the
// controller, clears the cluster provider, cancels all core and per-driver
// table watches, and shuts down NetworkDB.
func (c *Controller) agentClose() {
	// Acquire current agent instance and reset its pointer
	// then run closing functions
	c.mu.Lock()
	agent := c.agent
	c.agent = nil
	c.mu.Unlock()

	// when the agent is closed the cluster provider should be cleaned up
	c.SetClusterProvider(nil)

	if agent == nil {
		return
	}

	var cancelList []func()

	// Collect cancel functions while holding the agent lock, but invoke them
	// only after releasing it to avoid re-entrancy issues.
	agent.mu.Lock()
	for _, cancelFuncs := range agent.driverCancelFuncs {
		cancelList = append(cancelList, cancelFuncs...)
	}

	// Add also the cancel functions for the network db
	cancelList = append(cancelList, agent.coreCancelFuncs...)
	agent.mu.Unlock()

	for _, cancel := range cancelList {
		cancel()
	}

	agent.networkDB.Close()
}
// Task has the backend container details
type Task struct {
	Name       string            // name of the backend container/task
	EndpointID string            // libnetwork endpoint ID backing the task
	EndpointIP string            // IP assigned to the endpoint
	Info       map[string]string // driver-decoded, driver-specific endpoint info
}
// ServiceInfo has service specific details along with the list of backend tasks
type ServiceInfo struct {
	VIP          string   // virtual IP of the service
	LocalLBIndex int      // index of the local load balancer for the service
	Tasks        []Task   // backend tasks belonging to the service
	Ports        []string // formatted as "Target: <t>, Publish: <p>" per ingress port
}
// epRecord pairs a gossiped endpoint record with the driver-specific info
// decoded for it and the local load-balancer index of its service.
type epRecord struct {
	ep      EndpointRecord    // record gossiped via the libnetwork endpoint table
	info    map[string]string // driver-decoded endpoint details
	lbIndex int               // local LB index for the endpoint's service
}
// Services returns a map of services keyed by the service name with the details
// of all the tasks that belong to the service. Applicable only in swarm mode.
//
// It merges two sources of data from NetworkDB: the driver-agnostic endpoint
// records in libnetworkEPTable and the endpoint entries of the network
// driver's own tables (decoded by the driver itself).
func (n *Network) Services() map[string]ServiceInfo {
	agent, ok := n.clusterAgent()
	if !ok {
		return nil
	}

	nwID := n.ID()
	d, err := n.driver(true)
	if err != nil {
		log.G(context.TODO()).Errorf("Could not resolve driver for network %s/%s while fetching services: %v", n.networkType, nwID, err)
		return nil
	}

	// Walk through libnetworkEPTable and fetch the driver agnostic endpoint info
	eps := make(map[string]epRecord)
	c := n.getController()
	for eid, value := range agent.networkDB.GetTableByNetwork(libnetworkEPTable, nwID) {
		var epRec EndpointRecord
		if err := proto.Unmarshal(value.Value, &epRec); err != nil {
			log.G(context.TODO()).Errorf("Unmarshal of libnetworkEPTable failed for endpoint %s in network %s, %v", eid, nwID, err)
			continue
		}
		eps[eid] = epRecord{
			ep:      epRec,
			lbIndex: c.getLBIndex(epRec.ServiceID, nwID, epRec.IngressPorts),
		}
	}

	// Walk through the driver's tables, have the driver decode the entries
	// and return the tuple {ep ID, value}. value is a string that coveys
	// relevant info about the endpoint.
	for _, table := range n.driverTables {
		if table.objType != driverapi.EndpointObject {
			continue
		}
		for key, value := range agent.networkDB.GetTableByNetwork(table.name, nwID) {
			epID, info := d.DecodeTableEntry(table.name, key, value.Value)
			// A driver entry without a matching libnetwork record indicates
			// the two views of the network have diverged.
			if ep, ok := eps[epID]; !ok {
				log.G(context.TODO()).Errorf("Inconsistent driver and libnetwork state for endpoint %s", epID)
			} else {
				ep.info = info
				eps[epID] = ep
			}
		}
	}

	// group the endpoints into a map keyed by the service name
	sinfo := make(map[string]ServiceInfo)
	for ep, epr := range eps {
		s, ok := sinfo[epr.ep.ServiceName]
		if !ok {
			// First task seen for this service: record its VIP and LB index.
			s = ServiceInfo{
				VIP:          epr.ep.VirtualIP,
				LocalLBIndex: epr.lbIndex,
			}
		}
		// Populate the port list once per service, from the first task's record.
		if s.Ports == nil {
			ports := make([]string, 0, len(epr.ep.IngressPorts))
			for _, port := range epr.ep.IngressPorts {
				ports = append(ports, fmt.Sprintf("Target: %d, Publish: %d", port.TargetPort, port.PublishedPort))
			}
			s.Ports = ports
		}
		s.Tasks = append(s.Tasks, Task{
			Name:       epr.ep.Name,
			EndpointID: ep,
			EndpointIP: epr.ep.EndpointIP,
			Info:       epr.info,
		})
		sinfo[epr.ep.ServiceName] = s
	}
	return sinfo
}
// clusterAgent returns the controller's cluster agent, but only when the
// network is a swarm-scoped, multi-host network and an agent is actually
// running; ok reports whether a usable agent was found.
func (n *Network) clusterAgent() (agent *nwAgent, ok bool) {
	if n.scope != scope.Swarm || !n.driverIsMultihost() {
		return nil, false
	}
	if a := n.getController().getAgent(); a != nil {
		return a, true
	}
	return nil, false
}
// joinCluster registers this network with the NetworkDB gossip cluster. It is
// a no-op for networks that are not swarm-scoped multi-host networks.
func (n *Network) joinCluster() error {
	a, ok := n.clusterAgent()
	if !ok {
		return nil
	}
	return a.networkDB.JoinNetwork(n.ID())
}
// leaveCluster removes this network from the NetworkDB gossip cluster. It is
// a no-op for networks that are not swarm-scoped multi-host networks.
func (n *Network) leaveCluster() error {
	a, ok := n.clusterAgent()
	if !ok {
		return nil
	}
	return a.networkDB.LeaveNetwork(n.ID())
}
// addDriverInfoToCluster publishes the driver-specific table entries that were
// produced at join time to the cluster via NetworkDB. It stops at the first
// entry that fails to be created.
func (ep *Endpoint) addDriverInfoToCluster() error {
	ji := ep.joinInfo
	if ji == nil || len(ji.driverTableEntries) == 0 {
		return nil
	}

	nw := ep.getNetwork()
	a, ok := nw.clusterAgent()
	if !ok {
		return nil
	}

	networkID := nw.ID()
	for _, entry := range ji.driverTableEntries {
		if err := a.networkDB.CreateEntry(entry.tableName, networkID, entry.key, entry.value); err != nil {
			return err
		}
	}
	return nil
}
// deleteDriverInfoFromCluster removes from NetworkDB the driver-specific table
// entries that were published at join time. It stops at the first entry that
// fails to be deleted.
func (ep *Endpoint) deleteDriverInfoFromCluster() error {
	ji := ep.joinInfo
	if ji == nil || len(ji.driverTableEntries) == 0 {
		return nil
	}

	nw := ep.getNetwork()
	a, ok := nw.clusterAgent()
	if !ok {
		return nil
	}

	networkID := nw.ID()
	for _, entry := range ji.driverTableEntries {
		if err := a.networkDB.DeleteEntry(entry.tableName, networkID, entry.key); err != nil {
			return err
		}
	}
	return nil
}
// addServiceInfoToCluster registers the endpoint's service-discovery data with
// the cluster: it adds the local service binding (or plain container name
// resolution for attachable networks) and gossips the EndpointRecord through
// NetworkDB. It is a no-op when the endpoint has no DNS names, no interface
// address, or the network has no cluster agent.
func (ep *Endpoint) addServiceInfoToCluster(sb *Sandbox) error {
	if len(ep.dnsNames) == 0 || ep.Iface() == nil || ep.Iface().Address() == nil {
		return nil
	}

	n := ep.getNetwork()
	agent, ok := n.clusterAgent()
	if !ok {
		return nil
	}

	sb.service.Lock()
	defer sb.service.Unlock()
	log.G(context.TODO()).Debugf("addServiceInfoToCluster START for %s %s", ep.svcName, ep.ID())

	// Check that the endpoint is still present on the sandbox before adding it to the service discovery.
	// This is to handle a race between the EnableService and the sbLeave
	// It is possible that the EnableService starts, fetches the list of the endpoints and
	// by the time the addServiceInfoToCluster is called the endpoint got removed from the sandbox
	// The risk is that the deleteServiceInfoToCluster happens before the addServiceInfoToCluster.
	// This check under the Service lock of the sandbox ensure the correct behavior.
	// If the addServiceInfoToCluster arrives first may find or not the endpoint and will proceed or exit
	// but in any case the deleteServiceInfoToCluster will follow doing the cleanup if needed.
	// In case the deleteServiceInfoToCluster arrives first, this one is happening after the endpoint is
	// removed from the list, in this situation the delete will bail out not finding any data to cleanup
	// and the add will bail out not finding the endpoint on the sandbox.
	//
	// NOTE(review): this suppression branch fires when GetEndpoint's result is
	// nil. Confirm GetEndpoint returns the endpoint (nil == absent) rather
	// than an error; if it returns an error, this condition is inverted.
	if err := sb.GetEndpoint(ep.ID()); err == nil {
		log.G(context.TODO()).Warnf("addServiceInfoToCluster suppressing service resolution ep is not anymore in the sandbox %s", ep.ID())
		return nil
	}

	dnsNames := ep.getDNSNames()
	primaryDNSName, dnsAliases := dnsNames[0], dnsNames[1:]

	var ingressPorts []*PortConfig
	if ep.svcID != "" {
		// This is a task part of a service
		// Gossip ingress ports only in ingress network.
		if n.ingress {
			ingressPorts = ep.ingressPorts
		}
		if err := n.getController().addServiceBinding(ep.svcName, ep.svcID, n.ID(), ep.ID(), primaryDNSName, ep.virtualIP, ingressPorts, ep.svcAliases, dnsAliases, ep.Iface().Address().IP, "addServiceInfoToCluster"); err != nil {
			return err
		}
	} else {
		// This is a container simply attached to an attachable network
		if err := n.getController().addContainerNameResolution(n.ID(), ep.ID(), primaryDNSName, dnsAliases, ep.Iface().Address().IP, "addServiceInfoToCluster"); err != nil {
			return err
		}
	}

	// Gossip the endpoint record to the rest of the cluster.
	buf, err := proto.Marshal(&EndpointRecord{
		Name:            primaryDNSName,
		ServiceName:     ep.svcName,
		ServiceID:       ep.svcID,
		VirtualIP:       ep.virtualIP.String(),
		IngressPorts:    ingressPorts,
		Aliases:         ep.svcAliases,
		TaskAliases:     dnsAliases,
		EndpointIP:      ep.Iface().Address().IP.String(),
		ServiceDisabled: false,
	})
	if err != nil {
		return err
	}

	if err := agent.networkDB.CreateEntry(libnetworkEPTable, n.ID(), ep.ID(), buf); err != nil {
		log.G(context.TODO()).Warnf("addServiceInfoToCluster NetworkDB CreateEntry failed for %s %s err:%s", ep.id, n.id, err)
		return err
	}

	log.G(context.TODO()).Debugf("addServiceInfoToCluster END for %s %s", ep.svcName, ep.ID())

	return nil
}
// deleteServiceInfoFromCluster undoes addServiceInfoToCluster: it removes (or,
// when fullRemove is false, merely disables) the endpoint's gossiped record in
// NetworkDB and tears down the local service binding or container name
// resolution. method identifies the caller for logging purposes.
func (ep *Endpoint) deleteServiceInfoFromCluster(sb *Sandbox, fullRemove bool, method string) error {
	if len(ep.dnsNames) == 0 {
		return nil
	}

	n := ep.getNetwork()
	agent, ok := n.clusterAgent()
	if !ok {
		return nil
	}

	sb.service.Lock()
	defer sb.service.Unlock()
	log.G(context.TODO()).Debugf("deleteServiceInfoFromCluster from %s START for %s %s", method, ep.svcName, ep.ID())

	// Avoid a race w/ with a container that aborts preemptively. This would
	// get caught in disableServiceInNetworkDB, but we check here to make the
	// nature of the condition more clear.
	// See comment in addServiceInfoToCluster()
	//
	// NOTE(review): as in addServiceInfoToCluster, confirm GetEndpoint returns
	// the endpoint (nil == absent) rather than an error; if it returns an
	// error, this condition is inverted.
	if err := sb.GetEndpoint(ep.ID()); err == nil {
		log.G(context.TODO()).Warnf("deleteServiceInfoFromCluster suppressing service resolution ep is not anymore in the sandbox %s", ep.ID())
		return nil
	}

	dnsNames := ep.getDNSNames()
	primaryDNSName, dnsAliases := dnsNames[0], dnsNames[1:]

	// First update the networkDB then locally
	if fullRemove {
		if err := agent.networkDB.DeleteEntry(libnetworkEPTable, n.ID(), ep.ID()); err != nil {
			log.G(context.TODO()).Warnf("deleteServiceInfoFromCluster NetworkDB DeleteEntry failed for %s %s err:%s", ep.id, n.id, err)
		}
	} else {
		// Partial removal: keep the record but mark the service disabled.
		disableServiceInNetworkDB(agent, n, ep)
	}

	if ep.Iface() != nil && ep.Iface().Address() != nil {
		if ep.svcID != "" {
			// This is a task part of a service
			var ingressPorts []*PortConfig
			if n.ingress {
				ingressPorts = ep.ingressPorts
			}
			if err := n.getController().rmServiceBinding(ep.svcName, ep.svcID, n.ID(), ep.ID(), primaryDNSName, ep.virtualIP, ingressPorts, ep.svcAliases, dnsAliases, ep.Iface().Address().IP, "deleteServiceInfoFromCluster", true, fullRemove); err != nil {
				return err
			}
		} else {
			// This is a container simply attached to an attachable network
			if err := n.getController().delContainerNameResolution(n.ID(), ep.ID(), primaryDNSName, dnsAliases, ep.Iface().Address().IP, "deleteServiceInfoFromCluster"); err != nil {
				return err
			}
		}
	}

	log.G(context.TODO()).Debugf("deleteServiceInfoFromCluster from %s END for %s %s", method, ep.svcName, ep.ID())

	return nil
}
// disableServiceInNetworkDB marks the endpoint's gossiped service record as
// disabled (rather than deleting it) and propagates the updated record to the
// whole cluster. All failures are logged and swallowed.
func disableServiceInNetworkDB(a *nwAgent, n *Network, ep *Endpoint) {
	log.G(context.TODO()).Debugf("disableServiceInNetworkDB for %s %s", ep.svcName, ep.ID())

	// Fetch the current record so it can be updated in place.
	raw, err := a.networkDB.GetEntry(libnetworkEPTable, n.ID(), ep.ID())
	if err != nil {
		log.G(context.TODO()).Warnf("disableServiceInNetworkDB GetEntry failed for %s %s err:%s", ep.id, n.id, err)
		return
	}

	var rec EndpointRecord
	// Should never fail
	if err := proto.Unmarshal(raw, &rec); err != nil {
		log.G(context.TODO()).Errorf("disableServiceInNetworkDB unmarshal failed for %s %s err:%s", ep.id, n.id, err)
		return
	}

	rec.ServiceDisabled = true

	// Should never fail
	updated, err := proto.Marshal(&rec)
	if err != nil {
		log.G(context.TODO()).Errorf("disableServiceInNetworkDB marshalling failed for %s %s err:%s", ep.id, n.id, err)
		return
	}

	// Broadcast the disabled state to the whole cluster.
	if err := a.networkDB.UpdateEntry(libnetworkEPTable, n.ID(), ep.ID(), updated); err != nil {
		log.G(context.TODO()).Warnf("disableServiceInNetworkDB UpdateEntry failed for %s %s err:%s", ep.id, n.id, err)
	}
}
// addDriverWatches starts a NetworkDB watch, plus a table-event handler
// goroutine, for every driver table registered on this network, recording the
// cancel functions on the agent so they can be stopped later.
func (n *Network) addDriverWatches() {
	if len(n.driverTables) == 0 {
		return
	}

	a, ok := n.clusterAgent()
	if !ok {
		return
	}

	ctrlr := n.getController()
	nwID := n.ID()
	for _, table := range n.driverTables {
		ch, cancel := a.networkDB.Watch(table.name, nwID)
		a.mu.Lock()
		a.driverCancelFuncs[nwID] = append(a.driverCancelFuncs[nwID], cancel)
		a.mu.Unlock()
		go ctrlr.handleTableEvents(ch, n.handleDriverTableEvent)
	}
}
// cancelDriverWatches stops all driver-table watches registered for this
// network and forgets their cancel functions.
func (n *Network) cancelDriverWatches() {
	a, ok := n.clusterAgent()
	if !ok {
		return
	}

	nwID := n.ID()
	a.mu.Lock()
	toCancel := a.driverCancelFuncs[nwID]
	delete(a.driverCancelFuncs, nwID)
	a.mu.Unlock()

	// Invoke the cancel functions outside the agent lock.
	for _, cancel := range toCancel {
		cancel()
	}
}
// handleTableEvents dispatches every event received on ch to fn, returning
// once the channel is shut down.
func (c *Controller) handleTableEvents(ch *events.Channel, fn func(events.Event)) {
	for {
		select {
		case <-ch.Done():
			return
		case ev := <-ch.C:
			fn(ev)
		}
	}
}
// handleDriverTableEvent forwards a NetworkDB table event to the network's
// driver by translating it into a driverapi event notification.
func (n *Network) handleDriverTableEvent(ev events.Event) {
	d, err := n.driver(false)
	if err != nil {
		log.G(context.TODO()).Errorf("Could not resolve driver %s while handling driver table event: %v", n.networkType, err)
		return
	}

	var (
		etype driverapi.EventType
		tname string
		key   string
		value []byte
	)
	switch event := ev.(type) {
	case networkdb.CreateEvent:
		tname = event.Table
		key = event.Key
		value = event.Value
		etype = driverapi.Create
	case networkdb.DeleteEvent:
		tname = event.Table
		key = event.Key
		value = event.Value
		etype = driverapi.Delete
	case networkdb.UpdateEvent:
		tname = event.Table
		key = event.Key
		value = event.Value
		etype = driverapi.Update
	default:
		// Mirror handleEpTableEvent: without this guard, an unrecognized
		// event type would notify the driver with zero-valued event type,
		// table, key and value.
		log.G(context.TODO()).Errorf("Unexpected driver table event = %#v", event)
		return
	}

	d.EventNotify(etype, n.ID(), tname, key, value)
}
// handleNodeTableEvent reacts to node join/leave events gossiped through the
// NetworkDB node table and feeds the discovered address into node discovery.
// Create events register the node (isAdd=true), delete events deregister it.
func (c *Controller) handleNodeTableEvent(ev events.Event) {
	var (
		value    []byte
		isAdd    bool
		nodeAddr networkdb.NodeAddr
	)
	switch event := ev.(type) {
	case networkdb.CreateEvent:
		value = event.Value
		isAdd = true
	case networkdb.DeleteEvent:
		value = event.Value
	default:
		// Node table entries are only created and deleted; there is no
		// payload to decode for any other event type, so bail out instead
		// of falling through to json.Unmarshal with a nil value (which
		// previously produced a second, misleading unmarshalling error).
		log.G(context.TODO()).Errorf("Unexpected node table event = %#v", event)
		return
	}

	if err := json.Unmarshal(value, &nodeAddr); err != nil {
		log.G(context.TODO()).Errorf("Error unmarshalling node table event %v", err)
		return
	}
	c.processNodeDiscovery([]net.IP{nodeAddr.Addr}, isAdd)
}
// handleEpTableEvent applies a gossiped endpoint-table event to local state:
// for service tasks it adds/removes service bindings; for containers simply
// attached to an attachable network it adds/removes container name
// resolution entries.
func (c *Controller) handleEpTableEvent(ev events.Event) {
	var (
		nid   string
		eid   string
		value []byte
		epRec EndpointRecord
	)

	switch event := ev.(type) {
	case networkdb.CreateEvent:
		nid = event.NetworkID
		eid = event.Key
		value = event.Value
	case networkdb.DeleteEvent:
		nid = event.NetworkID
		eid = event.Key
		value = event.Value
	case networkdb.UpdateEvent:
		nid = event.NetworkID
		eid = event.Key
		value = event.Value
	default:
		log.G(context.TODO()).Errorf("Unexpected update service table event = %#v", event)
		return
	}

	err := proto.Unmarshal(value, &epRec)
	if err != nil {
		log.G(context.TODO()).WithError(err).Error("Failed to unmarshal service table value")
		return
	}

	containerName := epRec.Name
	svcName := epRec.ServiceName
	svcID := epRec.ServiceID
	vip := net.ParseIP(epRec.VirtualIP)
	ip := net.ParseIP(epRec.EndpointIP)
	ingressPorts := epRec.IngressPorts
	serviceAliases := epRec.Aliases
	taskAliases := epRec.TaskAliases

	logger := log.G(context.TODO()).WithFields(log.Fields{
		"nid": nid,
		"eid": eid,
		"T":   fmt.Sprintf("%T", ev),
		"R":   epRec,
	})

	// A record without a name or a parseable endpoint IP is unusable.
	if containerName == "" || ip == nil {
		logger.Errorf("Invalid endpoint name/ip received while handling service table event %s", value)
		return
	}

	logger.Debug("handleEpTableEvent")
	switch ev.(type) {
	case networkdb.CreateEvent, networkdb.UpdateEvent:
		if svcID != "" {
			// This is a remote task part of a service
			if epRec.ServiceDisabled {
				// Service disabled: remove the binding but keep the record
				// (fullRemove=false).
				if err := c.rmServiceBinding(svcName, svcID, nid, eid, containerName, vip, ingressPorts, serviceAliases, taskAliases, ip, "handleEpTableEvent", true, false); err != nil {
					logger.WithError(err).Error("failed disabling service binding")
					return
				}
			} else {
				if err := c.addServiceBinding(svcName, svcID, nid, eid, containerName, vip, ingressPorts, serviceAliases, taskAliases, ip, "handleEpTableEvent"); err != nil {
					logger.WithError(err).Error("failed adding service binding")
					return
				}
			}
		} else {
			// This is a remote container simply attached to an attachable network
			if err := c.addContainerNameResolution(nid, eid, containerName, taskAliases, ip, "handleEpTableEvent"); err != nil {
				logger.WithError(err).Errorf("failed adding container name resolution")
			}
		}
	case networkdb.DeleteEvent:
		if svcID != "" {
			// This is a remote task part of a service
			if err := c.rmServiceBinding(svcName, svcID, nid, eid, containerName, vip, ingressPorts, serviceAliases, taskAliases, ip, "handleEpTableEvent", true, true); err != nil {
				logger.WithError(err).Error("failed removing service binding")
				return
			}
		} else {
			// This is a remote container simply attached to an attachable network
			if err := c.delContainerNameResolution(nid, eid, containerName, taskAliases, ip, "handleEpTableEvent"); err != nil {
				logger.WithError(err).Errorf("failed removing container name resolution")
			}
		}
	}
}
// Code generated by protoc-gen-gogo. DO NOT EDIT.
// source: agent.proto
package libnetwork
import (
fmt "fmt"
_ "github.com/gogo/protobuf/gogoproto"
proto "github.com/gogo/protobuf/proto"
io "io"
math "math"
math_bits "math/bits"
reflect "reflect"
strings "strings"
)
// Reference imports to suppress errors if they are not otherwise used.
var _ = proto.Marshal
var _ = fmt.Errorf
var _ = math.Inf
// This is a compile-time assertion to ensure that this generated file
// is compatible with the proto package it is being compiled against.
// A compilation error at this line likely means your copy of the
// proto package needs to be updated.
const _ = proto.GoGoProtoPackageIsVersion3 // please upgrade the proto package
// PortConfig_Protocol is the generated enum for the PortConfig.Protocol
// field. NOTE: generated by protoc-gen-gogo from agent.proto; do not hand-edit.
type PortConfig_Protocol int32

const (
	ProtocolTCP  PortConfig_Protocol = 0
	ProtocolUDP  PortConfig_Protocol = 1
	ProtocolSCTP PortConfig_Protocol = 2
)

// Generated mappings between the enum's numeric values and wire names.
var PortConfig_Protocol_name = map[int32]string{
	0: "TCP",
	1: "UDP",
	2: "SCTP",
}

var PortConfig_Protocol_value = map[string]int32{
	"TCP":  0,
	"UDP":  1,
	"SCTP": 2,
}

// String returns the protobuf name of the enum value.
func (x PortConfig_Protocol) String() string {
	return proto.EnumName(PortConfig_Protocol_name, int32(x))
}

// EnumDescriptor returns the compressed file descriptor and the index path of
// this enum within it (generated code).
func (PortConfig_Protocol) EnumDescriptor() ([]byte, []int) {
	return fileDescriptor_56ede974c0020f77, []int{1, 0}
}
// EndpointRecord specifies all the endpoint specific information that
// needs to gossiped to nodes participating in the network.
//
// NOTE: generated by protoc-gen-gogo from agent.proto; do not hand-edit.
type EndpointRecord struct {
	// Name of the container
	Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"`
	// Service name of the service to which this endpoint belongs.
	ServiceName string `protobuf:"bytes,2,opt,name=service_name,json=serviceName,proto3" json:"service_name,omitempty"`
	// Service ID of the service to which this endpoint belongs.
	ServiceID string `protobuf:"bytes,3,opt,name=service_id,json=serviceId,proto3" json:"service_id,omitempty"`
	// Virtual IP of the service to which this endpoint belongs.
	VirtualIP string `protobuf:"bytes,4,opt,name=virtual_ip,json=virtualIp,proto3" json:"virtual_ip,omitempty"`
	// IP assigned to this endpoint.
	EndpointIP string `protobuf:"bytes,5,opt,name=endpoint_ip,json=endpointIp,proto3" json:"endpoint_ip,omitempty"`
	// IngressPorts exposed by the service to which this endpoint belongs.
	IngressPorts []*PortConfig `protobuf:"bytes,6,rep,name=ingress_ports,json=ingressPorts,proto3" json:"ingress_ports,omitempty"`
	// A list of aliases which are alternate names for the service
	Aliases []string `protobuf:"bytes,7,rep,name=aliases,proto3" json:"aliases,omitempty"`
	// List of aliases task specific aliases
	TaskAliases []string `protobuf:"bytes,8,rep,name=task_aliases,json=taskAliases,proto3" json:"task_aliases,omitempty"`
	// Whether this endpoint's service has been disabled
	ServiceDisabled bool `protobuf:"varint,9,opt,name=service_disabled,json=serviceDisabled,proto3" json:"service_disabled,omitempty"`
}

// Generated proto plumbing (Reset/ProtoMessage/Descriptor/XXX_*) for
// EndpointRecord; required by the gogo proto runtime.
func (m *EndpointRecord) Reset()      { *m = EndpointRecord{} }
func (*EndpointRecord) ProtoMessage() {}
func (*EndpointRecord) Descriptor() ([]byte, []int) {
	return fileDescriptor_56ede974c0020f77, []int{0}
}
func (m *EndpointRecord) XXX_Unmarshal(b []byte) error {
	return m.Unmarshal(b)
}
func (m *EndpointRecord) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
	if deterministic {
		return xxx_messageInfo_EndpointRecord.Marshal(b, m, deterministic)
	} else {
		b = b[:cap(b)]
		n, err := m.MarshalToSizedBuffer(b)
		if err != nil {
			return nil, err
		}
		return b[:n], nil
	}
}
func (m *EndpointRecord) XXX_Merge(src proto.Message) {
	xxx_messageInfo_EndpointRecord.Merge(m, src)
}
func (m *EndpointRecord) XXX_Size() int {
	return m.Size()
}
func (m *EndpointRecord) XXX_DiscardUnknown() {
	xxx_messageInfo_EndpointRecord.DiscardUnknown(m)
}

var xxx_messageInfo_EndpointRecord proto.InternalMessageInfo
// Generated nil-safe accessors for EndpointRecord: each getter returns the
// field's zero value when the receiver is nil. Do not hand-edit.
func (m *EndpointRecord) GetName() string {
	if m != nil {
		return m.Name
	}
	return ""
}

func (m *EndpointRecord) GetServiceName() string {
	if m != nil {
		return m.ServiceName
	}
	return ""
}

func (m *EndpointRecord) GetServiceID() string {
	if m != nil {
		return m.ServiceID
	}
	return ""
}

func (m *EndpointRecord) GetVirtualIP() string {
	if m != nil {
		return m.VirtualIP
	}
	return ""
}

func (m *EndpointRecord) GetEndpointIP() string {
	if m != nil {
		return m.EndpointIP
	}
	return ""
}

func (m *EndpointRecord) GetIngressPorts() []*PortConfig {
	if m != nil {
		return m.IngressPorts
	}
	return nil
}

func (m *EndpointRecord) GetAliases() []string {
	if m != nil {
		return m.Aliases
	}
	return nil
}

func (m *EndpointRecord) GetTaskAliases() []string {
	if m != nil {
		return m.TaskAliases
	}
	return nil
}

func (m *EndpointRecord) GetServiceDisabled() bool {
	if m != nil {
		return m.ServiceDisabled
	}
	return false
}
// PortConfig specifies an exposed port which can be
// addressed using the given name. This can be later queried
// using a service discovery api or a DNS SRV query. The node
// port specifies a port that can be used to address this
// service external to the cluster by sending a connection
// request to this port to any node on the cluster.
//
// NOTE: generated by protoc-gen-gogo from agent.proto; do not hand-edit.
type PortConfig struct {
	// Name for the port. If provided the port information can
	// be queried using the name as in a DNS SRV query.
	Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"`
	// Protocol for the port which is exposed.
	Protocol PortConfig_Protocol `protobuf:"varint,2,opt,name=protocol,proto3,enum=libnetwork.PortConfig_Protocol" json:"protocol,omitempty"`
	// The port which the application is exposing and is bound to.
	TargetPort uint32 `protobuf:"varint,3,opt,name=target_port,json=targetPort,proto3" json:"target_port,omitempty"`
	// PublishedPort specifies the port on which the service is
	// exposed on all nodes on the cluster. If not specified an
	// arbitrary port in the node port range is allocated by the
	// system. If specified it should be within the node port
	// range and it should be available.
	PublishedPort uint32 `protobuf:"varint,4,opt,name=published_port,json=publishedPort,proto3" json:"published_port,omitempty"`
}

// Generated proto plumbing (Reset/ProtoMessage/Descriptor/XXX_*) for
// PortConfig; required by the gogo proto runtime.
func (m *PortConfig) Reset()      { *m = PortConfig{} }
func (*PortConfig) ProtoMessage() {}
func (*PortConfig) Descriptor() ([]byte, []int) {
	return fileDescriptor_56ede974c0020f77, []int{1}
}
func (m *PortConfig) XXX_Unmarshal(b []byte) error {
	return m.Unmarshal(b)
}
func (m *PortConfig) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
	if deterministic {
		return xxx_messageInfo_PortConfig.Marshal(b, m, deterministic)
	} else {
		b = b[:cap(b)]
		n, err := m.MarshalToSizedBuffer(b)
		if err != nil {
			return nil, err
		}
		return b[:n], nil
	}
}
func (m *PortConfig) XXX_Merge(src proto.Message) {
	xxx_messageInfo_PortConfig.Merge(m, src)
}
func (m *PortConfig) XXX_Size() int {
	return m.Size()
}
func (m *PortConfig) XXX_DiscardUnknown() {
	xxx_messageInfo_PortConfig.DiscardUnknown(m)
}

var xxx_messageInfo_PortConfig proto.InternalMessageInfo
// Generated nil-safe accessors for PortConfig: each getter returns the
// field's zero value when the receiver is nil. Do not hand-edit.
func (m *PortConfig) GetName() string {
	if m != nil {
		return m.Name
	}
	return ""
}

func (m *PortConfig) GetProtocol() PortConfig_Protocol {
	if m != nil {
		return m.Protocol
	}
	return ProtocolTCP
}

func (m *PortConfig) GetTargetPort() uint32 {
	if m != nil {
		return m.TargetPort
	}
	return 0
}

func (m *PortConfig) GetPublishedPort() uint32 {
	if m != nil {
		return m.PublishedPort
	}
	return 0
}
// Generated registration of the enum, message types and compressed file
// descriptor with the proto runtime. Do not hand-edit.
func init() {
	proto.RegisterEnum("libnetwork.PortConfig_Protocol", PortConfig_Protocol_name, PortConfig_Protocol_value)
	proto.RegisterType((*EndpointRecord)(nil), "libnetwork.EndpointRecord")
	proto.RegisterType((*PortConfig)(nil), "libnetwork.PortConfig")
}

func init() { proto.RegisterFile("agent.proto", fileDescriptor_56ede974c0020f77) }

// fileDescriptor_56ede974c0020f77 is the gzip-compressed FileDescriptorProto
// for agent.proto (generated data; never edit by hand).
var fileDescriptor_56ede974c0020f77 = []byte{
	// 486 bytes of a gzipped FileDescriptorProto
	0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x6c, 0x91, 0x31, 0x6f, 0xd3, 0x40,
	0x18, 0x86, 0xed, 0x24, 0xb4, 0xf1, 0xe7, 0x24, 0x8d, 0x6e, 0x40, 0x56, 0x86, 0x8b, 0x89, 0x40,
	0x0a, 0x12, 0x38, 0x52, 0x19, 0x3b, 0xd1, 0x84, 0xc1, 0x0b, 0xb2, 0xae, 0x29, 0x6b, 0xb0, 0xe3,
	0xab, 0x7b, 0xaa, 0xeb, 0xb3, 0x7c, 0x97, 0xb2, 0xb2, 0x81, 0x3a, 0xf1, 0x07, 0x3a, 0xf1, 0x67,
	0x18, 0x3b, 0x76, 0xaa, 0xa8, 0xf3, 0x07, 0x58, 0xd9, 0xd0, 0x9d, 0xed, 0x46, 0x48, 0xdd, 0x7c,
	0xcf, 0xfb, 0x7c, 0xd6, 0x77, 0xef, 0x81, 0x1d, 0x26, 0x34, 0x93, 0x5e, 0x5e, 0x70, 0xc9, 0x11,
	0xa4, 0x2c, 0xca, 0xa8, 0xfc, 0xc2, 0x8b, 0x8b, 0xd1, 0xdb, 0x84, 0xc9, 0xf3, 0x4d, 0xe4, 0xad,
	0xf9, 0xe5, 0x2c, 0xe1, 0x09, 0x9f, 0x69, 0x25, 0xda, 0x9c, 0xe9, 0x93, 0x3e, 0xe8, 0xaf, 0x6a,
	0x74, 0xf2, 0xb7, 0x05, 0x83, 0x0f, 0x59, 0x9c, 0x73, 0x96, 0x49, 0x42, 0xd7, 0xbc, 0x88, 0x11,
	0x82, 0x4e, 0x16, 0x5e, 0x52, 0xc7, 0x74, 0xcd, 0xa9, 0x45, 0xf4, 0x37, 0x7a, 0x01, 0x3d, 0x41,
	0x8b, 0x2b, 0xb6, 0xa6, 0x2b, 0x9d, 0xb5, 0x74, 0x66, 0xd7, 0xec, 0xa3, 0x52, 0xde, 0x00, 0x34,
	0x0a, 0x8b, 0x9d, 0xb6, 0x12, 0x8e, 0xfb, 0xe5, 0xfd, 0xd8, 0x3a, 0xa9, 0xa8, 0xbf, 0x20, 0x56,
	0x2d, 0xf8, 0xb1, 0xb2, 0xaf, 0x58, 0x21, 0x37, 0x61, 0xba, 0x62, 0xb9, 0xd3, 0xd9, 0xd9, 0x9f,
	0x2a, 0xea, 0x07, 0xc4, 0xaa, 0x05, 0x3f, 0x47, 0x33, 0xb0, 0x69, 0xbd, 0xa4, 0xd2, 0x9f, 0x69,
	0x7d, 0x50, 0xde, 0x8f, 0xa1, 0xd9, 0xdd, 0x0f, 0x08, 0x34, 0x8a, 0x9f, 0xa3, 0x23, 0xe8, 0xb3,
	0x2c, 0x29, 0xa8, 0x10, 0xab, 0x9c, 0x17, 0x52, 0x38, 0x7b, 0x6e, 0x7b, 0x6a, 0x1f, 0x3e, 0xf7,
	0x76, 0x4d, 0x79, 0x01, 0x2f, 0xe4, 0x9c, 0x67, 0x67, 0x2c, 0x21, 0xbd, 0x5a, 0x56, 0x48, 0x20,
	0x07, 0xf6, 0xc3, 0x94, 0x85, 0x82, 0x0a, 0x67, 0xdf, 0x6d, 0x4f, 0x2d, 0xd2, 0x1c, 0x55, 0x0d,
	0x32, 0x14, 0x17, 0xab, 0x26, 0xee, 0xea, 0xd8, 0x56, 0xec, 0x7d, 0xad, 0xbc, 0x86, 0x61, 0x53,
	0x43, 0xcc, 0x44, 0x18, 0xa5, 0x34, 0x76, 0x2c, 0xd7, 0x9c, 0x76, 0xc9, 0x41, 0xcd, 0x17, 0x35,
	0x9e, 0x7c, 0x6b, 0x01, 0xec, 0x96, 0x78, 0xb2, 0xf7, 0x23, 0xe8, 0xea, 0x77, 0x5a, 0xf3, 0x54,
	0x77, 0x3e, 0x38, 0x1c, 0x3f, 0x7d, 0x05, 0x2f, 0xa8, 0x35, 0xf2, 0x38, 0x80, 0xc6, 0x60, 0xcb,
	0xb0, 0x48, 0xa8, 0xd4, 0x1d, 0xe8, 0x27, 0xe9, 0x13, 0xa8, 0x90, 0x9a, 0x44, 0xaf, 0x60, 0x90,
	0x6f, 0xa2, 0x94, 0x89, 0x73, 0x1a, 0x57, 0x4e, 0x47, 0x3b, 0xfd, 0x47, 0xaa, 0xb4, 0xc9, 0x67,
	0xe8, 0x36, 0x7f, 0x47, 0x0e, 0xb4, 0x97, 0xf3, 0x60, 0x68, 0x8c, 0x0e, 0xae, 0x6f, 0x5c, 0xbb,
	0xc1, 0xcb, 0x79, 0xa0, 0x92, 0xd3, 0x45, 0x30, 0x34, 0xff, 0x4f, 0x4e, 0x17, 0x01, 0x1a, 0x41,
	0xe7, 0x64, 0xbe, 0x0c, 0x86, 0xad, 0xd1, 0xf0, 0xfa, 0xc6, 0xed, 0x35, 0x91, 0x62, 0xa3, 0xce,
	0xf7, 0x9f, 0xd8, 0x38, 0x7e, 0x79, 0xf7, 0x80, 0x8d, 0x3f, 0x0f, 0xd8, 0xfc, 0x5a, 0x62, 0xf3,
	0x57, 0x89, 0xcd, 0xdb, 0x12, 0x9b, 0xbf, 0x4b, 0x6c, 0xfe, 0xd8, 0x62, 0xe3, 0x76, 0x8b, 0x8d,
	0xbb, 0x2d, 0x36, 0xa2, 0x3d, 0x7d, 0xb3, 0x77, 0xff, 0x02, 0x00, 0x00, 0xff, 0xff, 0xc0, 0xc6,
	0x3a, 0x88, 0xfc, 0x02, 0x00, 0x00,
}
// GoString renders the message as Go source (generated by the gogoproto
// gostring plugin; do not hand-edit).
func (this *EndpointRecord) GoString() string {
	if this == nil {
		return "nil"
	}
	s := make([]string, 0, 13)
	s = append(s, "&libnetwork.EndpointRecord{")
	s = append(s, "Name: "+fmt.Sprintf("%#v", this.Name)+",\n")
	s = append(s, "ServiceName: "+fmt.Sprintf("%#v", this.ServiceName)+",\n")
	s = append(s, "ServiceID: "+fmt.Sprintf("%#v", this.ServiceID)+",\n")
	s = append(s, "VirtualIP: "+fmt.Sprintf("%#v", this.VirtualIP)+",\n")
	s = append(s, "EndpointIP: "+fmt.Sprintf("%#v", this.EndpointIP)+",\n")
	if this.IngressPorts != nil {
		s = append(s, "IngressPorts: "+fmt.Sprintf("%#v", this.IngressPorts)+",\n")
	}
	s = append(s, "Aliases: "+fmt.Sprintf("%#v", this.Aliases)+",\n")
	s = append(s, "TaskAliases: "+fmt.Sprintf("%#v", this.TaskAliases)+",\n")
	s = append(s, "ServiceDisabled: "+fmt.Sprintf("%#v", this.ServiceDisabled)+",\n")
	s = append(s, "}")
	return strings.Join(s, "")
}

// GoString renders the message as Go source (generated by the gogoproto
// gostring plugin; do not hand-edit).
func (this *PortConfig) GoString() string {
	if this == nil {
		return "nil"
	}
	s := make([]string, 0, 8)
	s = append(s, "&libnetwork.PortConfig{")
	s = append(s, "Name: "+fmt.Sprintf("%#v", this.Name)+",\n")
	s = append(s, "Protocol: "+fmt.Sprintf("%#v", this.Protocol)+",\n")
	s = append(s, "TargetPort: "+fmt.Sprintf("%#v", this.TargetPort)+",\n")
	s = append(s, "PublishedPort: "+fmt.Sprintf("%#v", this.PublishedPort)+",\n")
	s = append(s, "}")
	return strings.Join(s, "")
}
// valueToGoStringAgent is a generated helper that renders a (possibly nil)
// pointer value as Go source for GoString output. Do not hand-edit.
func valueToGoStringAgent(v interface{}, typ string) string {
	rv := reflect.ValueOf(v)
	if rv.IsNil() {
		return "nil"
	}
	pv := reflect.Indirect(rv).Interface()
	return fmt.Sprintf("func(v %v) *%v { return &v } ( %#v )", typ, typ, pv)
}
func (m *EndpointRecord) Marshal() (dAtA []byte, err error) {
size := m.Size()
dAtA = make([]byte, size)
n, err := m.MarshalToSizedBuffer(dAtA[:size])
if err != nil {
return nil, err
}
return dAtA[:n], nil
}
// MarshalTo encodes m into the front of dAtA, which must have at least
// m.Size() bytes available, and returns the number of bytes written.
func (m *EndpointRecord) MarshalTo(dAtA []byte) (int, error) {
	size := m.Size()
	return m.MarshalToSizedBuffer(dAtA[:size])
}
// MarshalToSizedBuffer encodes m into the tail of dAtA, writing fields in
// reverse field-number order (the gogoproto back-to-front pattern), and
// returns the number of bytes written. Each single-byte constant below is a
// protobuf field key: fieldNumber<<3 | wireType.
func (m *EndpointRecord) MarshalToSizedBuffer(dAtA []byte) (int, error) {
	i := len(dAtA)
	_ = i
	var l int
	_ = l
	// Field 9 (ServiceDisabled, varint): key 0x48.
	if m.ServiceDisabled {
		i--
		if m.ServiceDisabled {
			dAtA[i] = 1
		} else {
			dAtA[i] = 0
		}
		i--
		dAtA[i] = 0x48
	}
	// Field 8 (TaskAliases, repeated string): key 0x42, emitted last-first.
	if len(m.TaskAliases) > 0 {
		for iNdEx := len(m.TaskAliases) - 1; iNdEx >= 0; iNdEx-- {
			i -= len(m.TaskAliases[iNdEx])
			copy(dAtA[i:], m.TaskAliases[iNdEx])
			i = encodeVarintAgent(dAtA, i, uint64(len(m.TaskAliases[iNdEx])))
			i--
			dAtA[i] = 0x42
		}
	}
	// Field 7 (Aliases, repeated string): key 0x3a.
	if len(m.Aliases) > 0 {
		for iNdEx := len(m.Aliases) - 1; iNdEx >= 0; iNdEx-- {
			i -= len(m.Aliases[iNdEx])
			copy(dAtA[i:], m.Aliases[iNdEx])
			i = encodeVarintAgent(dAtA, i, uint64(len(m.Aliases[iNdEx])))
			i--
			dAtA[i] = 0x3a
		}
	}
	// Field 6 (IngressPorts, repeated message): key 0x32; each element is
	// itself encoded back-to-front, then length-prefixed.
	if len(m.IngressPorts) > 0 {
		for iNdEx := len(m.IngressPorts) - 1; iNdEx >= 0; iNdEx-- {
			{
				size, err := m.IngressPorts[iNdEx].MarshalToSizedBuffer(dAtA[:i])
				if err != nil {
					return 0, err
				}
				i -= size
				i = encodeVarintAgent(dAtA, i, uint64(size))
			}
			i--
			dAtA[i] = 0x32
		}
	}
	// Fields 5..1 (strings): length-delimited, keys 0x2a, 0x22, 0x1a, 0x12, 0xa.
	if len(m.EndpointIP) > 0 {
		i -= len(m.EndpointIP)
		copy(dAtA[i:], m.EndpointIP)
		i = encodeVarintAgent(dAtA, i, uint64(len(m.EndpointIP)))
		i--
		dAtA[i] = 0x2a
	}
	if len(m.VirtualIP) > 0 {
		i -= len(m.VirtualIP)
		copy(dAtA[i:], m.VirtualIP)
		i = encodeVarintAgent(dAtA, i, uint64(len(m.VirtualIP)))
		i--
		dAtA[i] = 0x22
	}
	if len(m.ServiceID) > 0 {
		i -= len(m.ServiceID)
		copy(dAtA[i:], m.ServiceID)
		i = encodeVarintAgent(dAtA, i, uint64(len(m.ServiceID)))
		i--
		dAtA[i] = 0x1a
	}
	if len(m.ServiceName) > 0 {
		i -= len(m.ServiceName)
		copy(dAtA[i:], m.ServiceName)
		i = encodeVarintAgent(dAtA, i, uint64(len(m.ServiceName)))
		i--
		dAtA[i] = 0x12
	}
	if len(m.Name) > 0 {
		i -= len(m.Name)
		copy(dAtA[i:], m.Name)
		i = encodeVarintAgent(dAtA, i, uint64(len(m.Name)))
		i--
		dAtA[i] = 0xa
	}
	// Bytes written = distance from the end of the buffer.
	return len(dAtA) - i, nil
}
// Marshal serializes m into a freshly allocated buffer using the protobuf
// wire format and returns exactly the encoded bytes.
func (m *PortConfig) Marshal() (dAtA []byte, err error) {
	size := m.Size()
	dAtA = make([]byte, size)
	n, err := m.MarshalToSizedBuffer(dAtA[:size])
	if err != nil {
		return nil, err
	}
	return dAtA[:n], nil
}
// MarshalTo encodes m into the front of dAtA, which must have at least
// m.Size() bytes available, and returns the number of bytes written.
func (m *PortConfig) MarshalTo(dAtA []byte) (int, error) {
	size := m.Size()
	return m.MarshalToSizedBuffer(dAtA[:size])
}
// MarshalToSizedBuffer encodes m into the tail of dAtA in reverse field
// order and returns the number of bytes written. Byte constants are protobuf
// field keys (fieldNumber<<3 | wireType).
func (m *PortConfig) MarshalToSizedBuffer(dAtA []byte) (int, error) {
	i := len(dAtA)
	_ = i
	var l int
	_ = l
	// Field 4 (PublishedPort, varint): key 0x20.
	if m.PublishedPort != 0 {
		i = encodeVarintAgent(dAtA, i, uint64(m.PublishedPort))
		i--
		dAtA[i] = 0x20
	}
	// Field 3 (TargetPort, varint): key 0x18.
	if m.TargetPort != 0 {
		i = encodeVarintAgent(dAtA, i, uint64(m.TargetPort))
		i--
		dAtA[i] = 0x18
	}
	// Field 2 (Protocol, varint enum): key 0x10.
	if m.Protocol != 0 {
		i = encodeVarintAgent(dAtA, i, uint64(m.Protocol))
		i--
		dAtA[i] = 0x10
	}
	// Field 1 (Name, string): key 0xa.
	if len(m.Name) > 0 {
		i -= len(m.Name)
		copy(dAtA[i:], m.Name)
		i = encodeVarintAgent(dAtA, i, uint64(len(m.Name)))
		i--
		dAtA[i] = 0xa
	}
	return len(dAtA) - i, nil
}
// encodeVarintAgent writes v as a protobuf base-128 varint ending just
// before offset (the buffer is filled back-to-front by the marshalers) and
// returns the index of the varint's first byte.
func encodeVarintAgent(dAtA []byte, offset int, v uint64) int {
	// Varint width: bit-length of v (v|1 makes zero one bit wide) rounded
	// up to whole 7-bit groups — same formula as sovAgent.
	size := (math_bits.Len64(v|1) + 6) / 7
	pos := offset - size
	for j := pos; ; j++ {
		if v < 0x80 {
			dAtA[j] = uint8(v)
			break
		}
		dAtA[j] = uint8(v&0x7f | 0x80) // low 7 bits, continuation bit set
		v >>= 7
	}
	return pos
}
// Size returns the exact number of bytes the protobuf encoding of m will
// occupy. Each "1 +" is the single-byte field key; sovAgent is the varint
// width of the length or value.
func (m *EndpointRecord) Size() (n int) {
	if m == nil {
		return 0
	}
	var l int
	_ = l
	l = len(m.Name)
	if l > 0 {
		n += 1 + l + sovAgent(uint64(l))
	}
	l = len(m.ServiceName)
	if l > 0 {
		n += 1 + l + sovAgent(uint64(l))
	}
	l = len(m.ServiceID)
	if l > 0 {
		n += 1 + l + sovAgent(uint64(l))
	}
	l = len(m.VirtualIP)
	if l > 0 {
		n += 1 + l + sovAgent(uint64(l))
	}
	l = len(m.EndpointIP)
	if l > 0 {
		n += 1 + l + sovAgent(uint64(l))
	}
	if len(m.IngressPorts) > 0 {
		for _, e := range m.IngressPorts {
			l = e.Size()
			n += 1 + l + sovAgent(uint64(l))
		}
	}
	if len(m.Aliases) > 0 {
		for _, s := range m.Aliases {
			l = len(s)
			n += 1 + l + sovAgent(uint64(l))
		}
	}
	if len(m.TaskAliases) > 0 {
		for _, s := range m.TaskAliases {
			l = len(s)
			n += 1 + l + sovAgent(uint64(l))
		}
	}
	// Bool field: one key byte plus one value byte.
	if m.ServiceDisabled {
		n += 2
	}
	return n
}
// Size returns the exact number of bytes the protobuf encoding of m will
// occupy; zero-valued fields are omitted from the encoding and the count.
func (m *PortConfig) Size() (n int) {
	if m == nil {
		return 0
	}
	var l int
	_ = l
	l = len(m.Name)
	if l > 0 {
		n += 1 + l + sovAgent(uint64(l))
	}
	if m.Protocol != 0 {
		n += 1 + sovAgent(uint64(m.Protocol))
	}
	if m.TargetPort != 0 {
		n += 1 + sovAgent(uint64(m.TargetPort))
	}
	if m.PublishedPort != 0 {
		n += 1 + sovAgent(uint64(m.PublishedPort))
	}
	return n
}
// sovAgent returns the number of bytes needed to encode x as a protobuf
// base-128 varint. x|1 makes the zero value one bit wide, so zero still
// costs one byte; each encoded byte carries 7 payload bits.
func sovAgent(x uint64) (n int) {
	width := math_bits.Len64(x | 1)
	return (width + 6) / 7
}
// sozAgent returns the varint-encoded size of x under ZigZag mapping
// (used for sint fields): the mapping interleaves negatives so small
// magnitudes of either sign stay small before varint encoding.
func sozAgent(x uint64) (n int) {
	zz := (x << 1) ^ uint64(int64(x)>>63)
	return (math_bits.Len64(zz|1) + 6) / 7
}
// String implements fmt.Stringer with a compact single-string rendering.
// NOTE(review): strings.Replace with identical old/new arguments is a no-op;
// it is an artifact of the generator template (normally it rewrites nested
// type names) and can be ignored.
func (this *EndpointRecord) String() string {
	if this == nil {
		return "nil"
	}
	repeatedStringForIngressPorts := "[]*PortConfig{"
	for _, f := range this.IngressPorts {
		repeatedStringForIngressPorts += strings.Replace(f.String(), "PortConfig", "PortConfig", 1) + ","
	}
	repeatedStringForIngressPorts += "}"
	s := strings.Join([]string{`&EndpointRecord{`,
		`Name:` + fmt.Sprintf("%v", this.Name) + `,`,
		`ServiceName:` + fmt.Sprintf("%v", this.ServiceName) + `,`,
		`ServiceID:` + fmt.Sprintf("%v", this.ServiceID) + `,`,
		`VirtualIP:` + fmt.Sprintf("%v", this.VirtualIP) + `,`,
		`EndpointIP:` + fmt.Sprintf("%v", this.EndpointIP) + `,`,
		`IngressPorts:` + repeatedStringForIngressPorts + `,`,
		`Aliases:` + fmt.Sprintf("%v", this.Aliases) + `,`,
		`TaskAliases:` + fmt.Sprintf("%v", this.TaskAliases) + `,`,
		`ServiceDisabled:` + fmt.Sprintf("%v", this.ServiceDisabled) + `,`,
		`}`,
	}, "")
	return s
}
// String implements fmt.Stringer with a compact single-string rendering of
// all fields (generated code).
func (this *PortConfig) String() string {
	if this == nil {
		return "nil"
	}
	s := strings.Join([]string{`&PortConfig{`,
		`Name:` + fmt.Sprintf("%v", this.Name) + `,`,
		`Protocol:` + fmt.Sprintf("%v", this.Protocol) + `,`,
		`TargetPort:` + fmt.Sprintf("%v", this.TargetPort) + `,`,
		`PublishedPort:` + fmt.Sprintf("%v", this.PublishedPort) + `,`,
		`}`,
	}, "")
	return s
}
// valueToStringAgent formats a (possibly nil) pointer for String output:
// "nil" for a nil pointer, otherwise "*" followed by the pointee's default
// formatting.
func valueToStringAgent(v interface{}) string {
	if rv := reflect.ValueOf(v); !rv.IsNil() {
		return fmt.Sprintf("*%v", reflect.Indirect(rv).Interface())
	}
	return "nil"
}
// Unmarshal decodes the protobuf wire format in dAtA into m, appending to
// repeated fields and overwriting scalar fields. Unknown fields are skipped
// via skipAgent. Each inner shift-loop decodes one base-128 varint inline
// (generated gogoproto pattern).
func (m *EndpointRecord) Unmarshal(dAtA []byte) error {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		preIndex := iNdEx
		// Decode the field key: fieldNumber<<3 | wireType.
		var wire uint64
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowAgent
			}
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= uint64(b&0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: EndpointRecord: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: EndpointRecord: illegal tag %d (wire type %d)", fieldNum, wire)
		}
		switch fieldNum {
		case 1: // Name (string, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Name", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowAgent
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthAgent
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthAgent
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Name = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		case 2: // ServiceName (string)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field ServiceName", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowAgent
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthAgent
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthAgent
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.ServiceName = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		case 3: // ServiceID (string)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field ServiceID", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowAgent
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthAgent
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthAgent
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.ServiceID = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		case 4: // VirtualIP (string)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field VirtualIP", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowAgent
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthAgent
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthAgent
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.VirtualIP = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		case 5: // EndpointIP (string)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field EndpointIP", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowAgent
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthAgent
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthAgent
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.EndpointIP = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		case 6: // IngressPorts (repeated message)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field IngressPorts", wireType)
			}
			var msglen int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowAgent
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				msglen |= int(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			if msglen < 0 {
				return ErrInvalidLengthAgent
			}
			postIndex := iNdEx + msglen
			if postIndex < 0 {
				return ErrInvalidLengthAgent
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			// Recursively decode the nested PortConfig from its sub-slice.
			m.IngressPorts = append(m.IngressPorts, &PortConfig{})
			if err := m.IngressPorts[len(m.IngressPorts)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil {
				return err
			}
			iNdEx = postIndex
		case 7: // Aliases (repeated string)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Aliases", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowAgent
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthAgent
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthAgent
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Aliases = append(m.Aliases, string(dAtA[iNdEx:postIndex]))
			iNdEx = postIndex
		case 8: // TaskAliases (repeated string)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field TaskAliases", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowAgent
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthAgent
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthAgent
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.TaskAliases = append(m.TaskAliases, string(dAtA[iNdEx:postIndex]))
			iNdEx = postIndex
		case 9: // ServiceDisabled (bool, varint)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field ServiceDisabled", wireType)
			}
			var v int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowAgent
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				v |= int(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			m.ServiceDisabled = bool(v != 0)
		default:
			// Unknown field: rewind to the key and skip the whole entry.
			iNdEx = preIndex
			skippy, err := skipAgent(dAtA[iNdEx:])
			if err != nil {
				return err
			}
			if (skippy < 0) || (iNdEx+skippy) < 0 {
				return ErrInvalidLengthAgent
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			iNdEx += skippy
		}
	}

	if iNdEx > l {
		return io.ErrUnexpectedEOF
	}
	return nil
}
// Unmarshal decodes the protobuf wire format in dAtA into m, overwriting
// scalar fields. Unknown fields are skipped via skipAgent.
func (m *PortConfig) Unmarshal(dAtA []byte) error {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		preIndex := iNdEx
		// Decode the field key: fieldNumber<<3 | wireType.
		var wire uint64
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowAgent
			}
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= uint64(b&0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: PortConfig: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: PortConfig: illegal tag %d (wire type %d)", fieldNum, wire)
		}
		switch fieldNum {
		case 1: // Name (string, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Name", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowAgent
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthAgent
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthAgent
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Name = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		case 2: // Protocol (varint enum)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Protocol", wireType)
			}
			m.Protocol = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowAgent
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.Protocol |= PortConfig_Protocol(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 3: // TargetPort (varint uint32)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field TargetPort", wireType)
			}
			m.TargetPort = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowAgent
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.TargetPort |= uint32(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 4: // PublishedPort (varint uint32)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field PublishedPort", wireType)
			}
			m.PublishedPort = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowAgent
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.PublishedPort |= uint32(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		default:
			// Unknown field: rewind to the key and skip the whole entry.
			iNdEx = preIndex
			skippy, err := skipAgent(dAtA[iNdEx:])
			if err != nil {
				return err
			}
			if (skippy < 0) || (iNdEx+skippy) < 0 {
				return ErrInvalidLengthAgent
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			iNdEx += skippy
		}
	}

	if iNdEx > l {
		return io.ErrUnexpectedEOF
	}
	return nil
}
// skipAgent returns the number of bytes occupied by the wire entry starting
// at dAtA[0] (key plus payload), so callers can step over unknown fields.
// Nested (deprecated) group wire types 3/4 are tracked with a depth counter;
// the function only returns once depth is back to zero.
func skipAgent(dAtA []byte) (n int, err error) {
	l := len(dAtA)
	iNdEx := 0
	depth := 0
	for iNdEx < l {
		// Decode the field key.
		var wire uint64
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return 0, ErrIntOverflowAgent
			}
			if iNdEx >= l {
				return 0, io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= (uint64(b) & 0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		wireType := int(wire & 0x7)
		switch wireType {
		case 0:
			// Varint payload: advance past continuation bytes.
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return 0, ErrIntOverflowAgent
				}
				if iNdEx >= l {
					return 0, io.ErrUnexpectedEOF
				}
				iNdEx++
				if dAtA[iNdEx-1] < 0x80 {
					break
				}
			}
		case 1:
			// Fixed 64-bit payload.
			iNdEx += 8
		case 2:
			// Length-delimited payload: varint length, then that many bytes.
			var length int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return 0, ErrIntOverflowAgent
				}
				if iNdEx >= l {
					return 0, io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				length |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			if length < 0 {
				return 0, ErrInvalidLengthAgent
			}
			iNdEx += length
		case 3:
			// Start group: everything until the matching end group belongs
			// to the skipped field.
			depth++
		case 4:
			// End group without a matching start is malformed input.
			if depth == 0 {
				return 0, ErrUnexpectedEndOfGroupAgent
			}
			depth--
		case 5:
			// Fixed 32-bit payload.
			iNdEx += 4
		default:
			return 0, fmt.Errorf("proto: illegal wireType %d", wireType)
		}
		// Guard against iNdEx overflowing negative via huge lengths.
		if iNdEx < 0 {
			return 0, ErrInvalidLengthAgent
		}
		if depth == 0 {
			return iNdEx, nil
		}
	}
	return 0, io.ErrUnexpectedEOF
}
// Sentinel errors returned by the generated size/marshal/unmarshal helpers.
var (
	// ErrInvalidLengthAgent reports a negative or overflowing length prefix.
	ErrInvalidLengthAgent = fmt.Errorf("proto: negative length found during unmarshaling")
	// ErrIntOverflowAgent reports a varint wider than 64 bits.
	ErrIntOverflowAgent = fmt.Errorf("proto: integer overflow")
	// ErrUnexpectedEndOfGroupAgent reports an end-group tag with no matching start.
	ErrUnexpectedEndOfGroupAgent = fmt.Errorf("proto: unexpected end of group")
)
// Package bitmap provides a datatype for long vectors of bits.
package bitmap
import (
"encoding/binary"
"encoding/json"
"errors"
"fmt"
)
// block sequence constants
// If needed we can think of making these configurable
const (
	blockLen      = uint32(32)                  // number of bits in one block (one RLE symbol)
	blockBytes    = uint64(blockLen / 8)        // size of one block in bytes (4)
	blockMAX      = uint32(1<<blockLen - 1)     // fully allocated block: all bits set
	blockFirstBit = uint32(1) << (blockLen - 1) // mask selecting the most significant bit of a block
	invalidPos    = uint64(0xFFFFFFFFFFFFFFFF)  // sentinel position returned when no bit is available
)
// Sentinel errors; compare with errors.Is.
var (
	// ErrNoBitAvailable is returned when no more bits are available to set
	ErrNoBitAvailable = errors.New("no bit available")
	// ErrBitAllocated is returned when the specific bit requested is already set
	ErrBitAllocated = errors.New("requested bit is already allocated")
)
// https://github.com/golang/go/issues/8005#issuecomment-190753527
//
// noCopy is embedded in Bitmap so that `go vet`'s copylocks check flags
// shallow copies of a Bitmap value. It carries no data.
type noCopy struct{}

// Lock is a no-op that exists only so noCopy satisfies sync.Locker,
// which is what triggers the copylocks vet check.
func (noCopy) Lock() {}
// Bitmap is a fixed-length bit vector. It is not safe for concurrent use.
//
// The data is stored as a list of run-length encoded blocks. It operates
// directly on the encoded representation, without decompressing.
type Bitmap struct {
	bits       uint64    // total number of ordinals, i.e. valid ordinals are [0, bits)
	unselected uint64    // count of ordinals currently unset
	head       *sequence // head of the RLE-compressed block list
	curr       uint64    // next ordinal to scan from for serial SetAny/SetAnyInRange
	// Shallow copies would share the same head pointer but a copy of the
	// unselected count. Mutating the sequence through one would change the
	// bits for all copies but only update that one copy's unselected count,
	// which would result in subtle bugs.
	noCopy noCopy
}
// New returns a new Bitmap of ordinals in the interval [0, n).
// All bits start unset: the whole vector compresses to a single
// zero-valued sequence node spanning every block.
func New(n uint64) *Bitmap {
	return &Bitmap{
		bits:       n,
		unselected: n,
		head: &sequence{
			block: 0x0,
			count: getNumBlocks(n),
		},
	}
}
// Copy returns a deep copy of b: the sequence list is duplicated node by
// node (getCopy), and the serial-scan position (curr) is preserved.
func Copy(b *Bitmap) *Bitmap {
	return &Bitmap{
		bits:       b.bits,
		unselected: b.unselected,
		head:       b.head.getCopy(),
		curr:       b.curr,
	}
}
// sequence represents a recurring sequence of 32 bits long bitmasks:
// one node of the run-length-encoded list that backs a Bitmap.
type sequence struct {
	block uint32    // block is a symbol representing 4 byte long allocation bitmask
	count uint64    // number of consecutive blocks (symbols)
	next  *sequence // next sequence
}
// toString returns a string representation of the block sequence starting
// from this node, e.g. "(0x0, 3)->(0xffffffff, 1)->end".
func (s *sequence) toString() string {
	out := ""
	for p := s; p != nil; p = p.next {
		out += fmt.Sprintf("(0x%x, %d)->", p.block, p.count)
	}
	return out + "end"
}
// getAvailableBit returns the position of the first unset bit in this
// node's 32-bit block, scanning from bit offset `from` (0 = MSB), as a
// (byte-within-block, bit-within-byte) pair. A full block, a zero-count
// node, or no free bit at/after `from` yields ErrNoBitAvailable.
func (s *sequence) getAvailableBit(from uint64) (uint64, uint64, error) {
	if s.block == blockMAX || s.count == 0 {
		return invalidPos, invalidPos, ErrNoBitAvailable
	}
	bits := from
	bitSel := blockFirstBit >> from
	// Walk the selector toward the LSB until it lands on an unset bit.
	for bitSel > 0 && s.block&bitSel != 0 {
		bitSel >>= 1
		bits++
	}
	// Check if the loop exited because it could not
	// find any available bit int block starting from
	// "from". Return invalid pos in that case.
	if bitSel == 0 {
		return invalidPos, invalidPos, ErrNoBitAvailable
	}
	return bits / 8, bits % 8, nil
}
// getCopy returns a deep copy of the linked list rooted at this node;
// the next pointers of the copy never alias the original.
func (s *sequence) getCopy() *sequence {
	head := &sequence{block: s.block, count: s.count}
	tail := head
	for src := s.next; src != nil; src = src.next {
		tail.next = &sequence{block: src.block, count: src.count}
		tail = tail.next
	}
	return head
}
// equal reports whether the list rooted at s and the list rooted at o have
// the same length and pairwise-identical (block, count) values.
func (s *sequence) equal(o *sequence) bool {
	a, b := s, o
	for a != nil && b != nil {
		if a.block != b.block || a.count != b.count {
			return false
		}
		a, b = a.next, b.next
	}
	// Equal only if both lists ended at the same time.
	return a == nil && b == nil
}
// toByteArray serializes the list rooted at s as a concatenation of 12-byte
// records: 4-byte big-endian block followed by 8-byte big-endian count.
func (s *sequence) toByteArray() ([]byte, error) {
	var out []byte
	var rec [12]byte
	for p := s; p != nil; p = p.next {
		binary.BigEndian.PutUint32(rec[0:4], p.block)
		binary.BigEndian.PutUint64(rec[4:12], p.count)
		out = append(out, rec[:]...)
	}
	return out, nil
}
// fromByteArray deserializes a sequence list previously produced by
// toByteArray: a concatenation of 12-byte records (4-byte big-endian block,
// 8-byte big-endian count). The decoded list is written over the receiver,
// which becomes the head.
//
// Empty input is rejected explicitly: a valid serialization always contains
// at least one record, and the previous code paniced (index out of range on
// data[0:4]) when handed a zero-length slice.
func (s *sequence) fromByteArray(data []byte) error {
	l := len(data)
	if l == 0 || l%12 != 0 {
		return fmt.Errorf("cannot deserialize byte sequence of length %d (%v)", l, data)
	}
	p := s
	for i := 0; ; i += 12 {
		p.block = binary.BigEndian.Uint32(data[i : i+4])
		p.count = binary.BigEndian.Uint64(data[i+4 : i+12])
		if i+12 == l {
			return nil
		}
		p.next = &sequence{}
		p = p.next
	}
}
// SetAnyInRange sets the first unset bit in the range [start, end] (both
// bounds inclusive; end must be < h.bits) and returns the ordinal of the
// set bit.
//
// When serial=true, the bitmap is scanned starting from the ordinal following
// the bit most recently set by [Bitmap.SetAny] or [Bitmap.SetAnyInRange].
func (h *Bitmap) SetAnyInRange(start, end uint64, serial bool) (uint64, error) {
	if end < start || end >= h.bits {
		// The range is inclusive of end, so report it with closed-interval
		// notation; the previous "[%d, %d)" misrepresented the bound.
		return invalidPos, fmt.Errorf("invalid bit range [%d, %d]", start, end)
	}
	if h.Unselected() == 0 {
		return invalidPos, ErrNoBitAvailable
	}
	return h.set(0, start, end, true, false, serial)
}
// SetAny sets the first unset bit in the sequence and returns the ordinal of
// the set bit.
//
// When serial=true, the bitmap is scanned starting from the ordinal following
// the bit most recently set by [Bitmap.SetAny] or [Bitmap.SetAnyInRange].
func (h *Bitmap) SetAny(serial bool) (uint64, error) {
	if h.Unselected() == 0 {
		return invalidPos, ErrNoBitAvailable
	}
	// Equivalent to SetAnyInRange over the whole vector [0, bits-1].
	return h.set(0, 0, h.bits-1, true, false, serial)
}
// Set sets the bit at the given ordinal, returning ErrBitAllocated if it is
// already set. NOTE(review): the previous comment said "atomically", but
// Bitmap is documented as not safe for concurrent use — there is no
// synchronization here.
func (h *Bitmap) Set(ordinal uint64) error {
	if err := h.validateOrdinal(ordinal); err != nil {
		return err
	}
	_, err := h.set(ordinal, 0, 0, false, false, false)
	return err
}
// Unset clears the bit at the given ordinal. NOTE(review): despite the old
// "atomically" wording, Bitmap performs no synchronization.
func (h *Bitmap) Unset(ordinal uint64) error {
	if err := h.validateOrdinal(ordinal); err != nil {
		return err
	}
	// release=true flips the operation in set() to clearing the bit.
	_, err := h.set(ordinal, 0, 0, false, true, false)
	return err
}
// IsSet checks if the ordinal bit is set. In case ordinal
// is outside of the bit sequence limits, false is returned.
// (No synchronization is performed despite the historical "atomically".)
func (h *Bitmap) IsSet(ordinal uint64) bool {
	if err := h.validateOrdinal(ordinal); err != nil {
		return false
	}
	// checkIfAvailable returns nil when the bit is free, ErrBitAllocated
	// when it is taken — so a non-nil error means "set".
	_, _, err := checkIfAvailable(h.head, ordinal)
	return err != nil
}
// set is the single mutation path for the bitmap. Depending on the flags it
// either releases `ordinal` (release=true), claims `ordinal` exactly
// (isAvailable=false), or searches [start, end] for the first free bit
// (isAvailable=true), optionally resuming from the serial cursor h.curr.
// It returns the affected ordinal.
func (h *Bitmap) set(ordinal, start, end uint64, isAvailable bool, release bool, serial bool) (uint64, error) {
	var (
		bitPos  uint64
		bytePos uint64
		ret     uint64
		err     error
	)

	curr := uint64(0)
	if serial {
		curr = h.curr
	}
	// Get position if available
	if release {
		bytePos, bitPos = ordinalToPos(ordinal)
	} else {
		if isAvailable {
			bytePos, bitPos, err = getAvailableFromCurrent(h.head, start, curr, end)
			ret = posToOrdinal(bytePos, bitPos)
			if err == nil {
				// Advance the serial cursor past the bit just found.
				h.curr = ret + 1
			}
		} else {
			bytePos, bitPos, err = checkIfAvailable(h.head, ordinal)
			ret = ordinal
		}
	}
	if err != nil {
		return ret, err
	}

	// Rewrite the RLE list with the bit flipped; changed is false when the
	// request was redundant (bit already in the requested state).
	var changed bool
	h.head, changed = pushReservation(bytePos, bitPos, h.head, release)
	if changed {
		if release {
			h.unselected++
		} else {
			h.unselected--
		}
	}

	return ret, nil
}
// validateOrdinal rejects ordinals outside [0, h.bits). The check is needed
// because the number of bits is not necessarily a multiple of blockLen, so
// the trailing block may contain ordinals that are not part of the vector.
func (h *Bitmap) validateOrdinal(ordinal uint64) error {
	if ordinal >= h.bits {
		return errors.New("bit does not belong to the sequence")
	}
	return nil
}
// MarshalBinary encodes h into a binary representation: 8-byte big-endian
// bits, 8-byte big-endian unselected, then the serialized sequence list.
// The serial cursor (curr) is intentionally not serialized.
func (h *Bitmap) MarshalBinary() ([]byte, error) {
	bm, err := h.head.toByteArray()
	if err != nil {
		return nil, fmt.Errorf("failed to serialize head: %v", err)
	}
	// Pre-allocate capacity for "bits" and "unselected" (16 bytes) and head.
	ba := make([]byte, 0, 16+len(bm))
	ba = binary.BigEndian.AppendUint64(ba, h.bits)
	ba = binary.BigEndian.AppendUint64(ba, h.unselected)
	ba = append(ba, bm...)

	return ba, nil
}
// UnmarshalBinary decodes a binary representation of a Bitmap value which was
// generated using [Bitmap.MarshalBinary].
//
// The scan position for serial [Bitmap.SetAny] and [Bitmap.SetAnyInRange]
// operations is neither unmarshaled nor reset.
func (h *Bitmap) UnmarshalBinary(ba []byte) error {
	if ba == nil {
		return errors.New("nil byte array")
	}
	// Guard the fixed 16-byte header; previously a short non-nil slice
	// paniced on ba[16:] / ba[0:8] instead of returning an error.
	if len(ba) < 16 {
		return errors.New("byte array too short to contain a bitmap header")
	}

	nh := &sequence{}
	if err := nh.fromByteArray(ba[16:]); err != nil {
		return fmt.Errorf("failed to deserialize head: %v", err)
	}

	h.head = nh
	h.bits = binary.BigEndian.Uint64(ba[0:8])
	h.unselected = binary.BigEndian.Uint64(ba[8:16])
	return nil
}
// Bits returns the length of the bit sequence
func (h *Bitmap) Bits() uint64 {
	return h.bits
}

// Unselected returns the number of bits which are not selected
func (h *Bitmap) Unselected() uint64 {
	return h.unselected
}

// String returns a debug rendering of the bitmap, including the full
// RLE sequence and the serial-scan cursor.
func (h *Bitmap) String() string {
	return fmt.Sprintf("Bits: %d, Unselected: %d, Sequence: %s Curr:%d",
		h.bits, h.unselected, h.head.toString(), h.curr)
}
// MarshalJSON encodes h into a JSON message: the binary representation
// from MarshalBinary wrapped as a base64 JSON string ([]byte encoding).
func (h *Bitmap) MarshalJSON() ([]byte, error) {
	b, err := h.MarshalBinary()
	if err != nil {
		return nil, err
	}
	return json.Marshal(b)
}

// UnmarshalJSON decodes JSON message into h
func (h *Bitmap) UnmarshalJSON(data []byte) error {
	var b []byte
	if err := json.Unmarshal(data, &b); err != nil {
		return err
	}
	return h.UnmarshalBinary(b)
}
// getFirstAvailable looks for the first unset bit in the mask starting from
// ordinal `start`, returning a (byte position, bit-within-byte) pair over
// the whole vector. The retry/next labels walk repeated instances of an
// RLE-compressed block before moving to the next list node.
func getFirstAvailable(head *sequence, start uint64) (uint64, uint64, error) {
	// Find sequence which contains the start bit
	byteStart, bitStart := ordinalToPos(start)
	current, _, precBlocks, inBlockBytePos := findSequence(head, byteStart)
	// Derive the this sequence offsets
	byteOffset := byteStart - inBlockBytePos
	bitOffset := inBlockBytePos*8 + bitStart
	for current != nil {
		if current.block != blockMAX {
			// If the current block is not full, check if there is any bit
			// from the current bit in the current block. If not, before proceeding to the
			// next block node, make sure we check for available bit in the next
			// instance of the same block. Due to RLE same block signature will be
			// compressed.
		retry:
			bytePos, bitPos, err := current.getAvailableBit(bitOffset)
			if err != nil && precBlocks == current.count-1 {
				// This is the last instance in the same block node,
				// so move to the next block.
				goto next
			}
			if err != nil {
				// There are some more instances of the same block, so add the offset
				// and be optimistic that you will find the available bit in the next
				// instance of the same block.
				bitOffset = 0
				byteOffset += blockBytes
				precBlocks++
				goto retry
			}
			return byteOffset + bytePos, bitPos, err
		}
		// Moving to next block: Reset bit offset.
	next:
		bitOffset = 0
		byteOffset += (current.count * blockBytes) - (precBlocks * blockBytes)
		precBlocks = 0
		current = current.next
	}
	return invalidPos, invalidPos, ErrNoBitAvailable
}
// getAvailableFromCurrent will look for available ordinal from the current ordinal.
// If none found then it will loop back to the start to check of the available bit.
// This can be further optimized to check from start till curr in case of a rollover
func getAvailableFromCurrent(head *sequence, start, curr, end uint64) (uint64, uint64, error) {
	var bytePos, bitPos uint64
	var err error
	// First pass: resume the scan from the serial cursor when it lies
	// strictly inside the range; fall back to `start` on miss or overshoot.
	if curr != 0 && curr > start {
		bytePos, bitPos, err = getFirstAvailable(head, curr)
		ret := posToOrdinal(bytePos, bitPos)
		if end < ret || err != nil {
			goto begin
		}
		return bytePos, bitPos, nil
	}

begin:
	bytePos, bitPos, err = getFirstAvailable(head, start)
	ret := posToOrdinal(bytePos, bitPos)
	if end < ret || err != nil {
		return invalidPos, invalidPos, ErrNoBitAvailable
	}
	return bytePos, bitPos, nil
}
// checkIfAvailable checks if the bit correspondent to the specified ordinal is unset
// If the ordinal is beyond the sequence limits, a negative response is returned
// (ErrBitAllocated in both the "taken" and "out of range" cases).
func checkIfAvailable(head *sequence, ordinal uint64) (uint64, uint64, error) {
	bytePos, bitPos := ordinalToPos(ordinal)

	// Find the sequence containing this byte
	current, _, _, inBlockBytePos := findSequence(head, bytePos)
	if current != nil {
		// Check whether the bit corresponding to the ordinal address is unset
		bitSel := blockFirstBit >> (inBlockBytePos*8 + bitPos)
		if current.block&bitSel == 0 {
			return bytePos, bitPos, nil
		}
	}

	return invalidPos, invalidPos, ErrBitAllocated
}
// Given the byte position and the sequences list head, return the pointer to the
// sequence containing the byte (current), the pointer to the previous sequence,
// the number of blocks preceding the block containing the byte inside the current sequence.
// If bytePos is outside of the list, function will return (nil, nil, 0, invalidPos)
func findSequence(head *sequence, bytePos uint64) (*sequence, *sequence, uint64, uint64) {
	// Find the sequence containing this byte
	previous := head
	current := head
	n := bytePos
	for current.next != nil && n >= (current.count*blockBytes) { // Nil check for less than 32 addresses masks
		n -= (current.count * blockBytes)
		previous = current
		current = current.next
	}

	// If byte is outside of the list, let caller know
	if n >= (current.count * blockBytes) {
		return nil, nil, 0, invalidPos
	}

	// Find the byte position inside the block and the number of blocks
	// preceding the block containing the byte inside this sequence.
	// (bytePos % blockBytes equals n % blockBytes here, since every amount
	// subtracted from n above is a multiple of blockBytes.)
	precBlocks := n / blockBytes
	inBlockBytePos := bytePos % blockBytes
	return current, previous, precBlocks, inBlockBytePos
}
// PushReservation pushes the bit reservation inside the bitmask.
// Given byte and bit positions, identify the sequence (current) which holds the block containing the affected bit.
// Create a new block with the modified bit according to the operation (allocate/release).
// Create a new sequence containing the new block and insert it in the proper position.
// Remove current sequence if empty.
// Check if new sequence can be merged with neighbour (previous/next) sequences.
//
// Identify "current" sequence containing block:
//
//	[prev seq] [current seq] [next seq]
//
// Based on block position, resulting list of sequences can be any of three forms:
//
//	block position        Resulting list of sequences
//
// A) block is first in current:   [prev seq] [new] [modified current seq] [next seq]
// B) block is last in current:    [prev seq] [modified current seq] [new] [next seq]
// C) block is in the middle of current: [prev seq] [curr pre] [new] [curr post] [next seq]
//
// Return value changed is true if the bit value was changed.
func pushReservation(bytePos, bitPos uint64, head *sequence, release bool) (_ *sequence, changed bool) {
	// Store list's head
	newHead := head

	// Find the sequence containing this byte
	current, previous, precBlocks, inBlockBytePos := findSequence(head, bytePos)
	if current == nil {
		return newHead, false
	}

	// Construct updated block
	bitSel := blockFirstBit >> (inBlockBytePos*8 + bitPos)
	newBlock := current.block
	if release {
		newBlock &^= bitSel
	} else {
		newBlock |= bitSel
	}

	// Quit if it was a redundant request
	if current.block == newBlock {
		return newHead, false
	}

	// Current sequence inevitably looses one block, update count
	current.count--

	// Create new sequence
	newSequence := &sequence{block: newBlock, count: 1}

	// Insert the new sequence in the list based on block position
	switch precBlocks {
	case 0: // First in sequence (A)
		newSequence.next = current
		if current == head {
			newHead = newSequence
			previous = newHead
		} else {
			previous.next = newSequence
		}
		removeCurrentIfEmpty(&newHead, newSequence, current)
		mergeSequences(previous)
	case current.count: // Last in sequence (B)
		newSequence.next = current.next
		current.next = newSequence
		mergeSequences(current)
	default: // In between the sequence (C)
		currPre := &sequence{block: current.block, count: precBlocks, next: newSequence}
		currPost := current
		currPost.count -= precBlocks
		newSequence.next = currPost
		if currPost == head {
			newHead = currPre
		} else {
			previous.next = currPre
		}
		// No merging or empty current possible here
	}

	return newHead, true
}
// removeCurrentIfEmpty unlinks current from the list when its count has
// dropped to zero, adjusting *head if current was the first node.
func removeCurrentIfEmpty(head **sequence, previous, current *sequence) {
	if current.count != 0 {
		return
	}
	if current == *head {
		*head = current.next
		return
	}
	previous.next = current.next
}
// mergeSequences coalesces every run of adjacent nodes carrying the same
// block value, walking iteratively from seq to the end of the list (the
// original recursed per node; the effect is identical).
// TODO: Optimization: only attempt merge from start to end sequence, no need to scan till the end of the list
func mergeSequences(seq *sequence) {
	for s := seq; s != nil; s = s.next {
		// Absorb every immediately-following node with the same block.
		for s.next != nil && s.block == s.next.block {
			s.count += s.next.count
			s.next = s.next.next
		}
	}
}
// getNumBlocks returns the number of blocks needed to hold numBits bits,
// rounding up when numBits is not an exact multiple of the block length.
func getNumBlocks(numBits uint64) uint64 {
	quotient, remainder := numBits/uint64(blockLen), numBits%uint64(blockLen)
	if remainder == 0 {
		return quotient
	}
	return quotient + 1
}
// ordinalToPos converts an absolute bit ordinal into its (byte index, bit
// index within the byte) position.
func ordinalToPos(ordinal uint64) (uint64, uint64) {
	return ordinal >> 3, ordinal & 7
}
// posToOrdinal converts a (byte index, bit index) position into the
// corresponding absolute bit ordinal.
func posToOrdinal(bytePos, bitPos uint64) uint64 {
	return (bytePos << 3) + bitPos
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package config
import (
"context"
"strings"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/cluster"
"github.com/docker/docker/daemon/libnetwork/datastore"
"github.com/docker/docker/daemon/libnetwork/ipamutils"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/pkg/plugingetter"
)
const (
	// warningThNetworkControlPlaneMTU is the MTU threshold below which
	// OptionNetworkControlPlaneMTU logs a warning.
	warningThNetworkControlPlaneMTU = 1500
	// minimumNetworkControlPlaneMTU is the lowest accepted control-plane MTU;
	// smaller values are clamped up to it.
	minimumNetworkControlPlaneMTU = 500
)
// Config encapsulates configurations of various Libnetwork components
type Config struct {
	// DataDir is the base directory used when creating the datastore.
	DataDir string
	// ExecRoot is the base-path for libnetwork external key listeners
	// (created in "<ExecRoot>/libnetwork/<Controller-Short-ID>.sock"),
	// and is passed as the "-exec-root" argument for "libnetwork-setkey".
	//
	// It is only used on Linux, but referenced in some "unix" files
	// (linux and freebsd).
	//
	// FIXME(thaJeztah): ExecRoot is only used for Controller.startExternalKeyListener(), but "libnetwork-setkey" is only implemented on Linux.
	ExecRoot               string
	DefaultNetwork         string
	DefaultDriver          string
	// Labels holds only labels carrying the netlabel prefix; see OptionLabels.
	Labels                 []string
	// driverCfg maps a driver name to its driver-specific configuration;
	// populated via OptionDriverConfig and read via DriverConfig.
	driverCfg              map[string]map[string]any
	ClusterProvider        cluster.Provider
	NetworkControlPlaneMTU int
	DefaultAddressPool     []*ipamutils.NetworkToSplit
	// DatastoreBucket defaults to datastore.DefaultBucket (set in New).
	DatastoreBucket        string
	// ActiveSandboxes holds the sandboxes that were active during the
	// previous daemon lifecycle; see OptionActiveSandboxes.
	ActiveSandboxes        map[string]any
	PluginGetter           plugingetter.PluginGetter
	FirewallBackend        string
	Rootless               bool
	EnableUserlandProxy    bool
	UserlandProxyPath      string
}
// New creates a new Config, initialized with defaults, and applies the given
// Options to it. Nil options are skipped.
func New(opts ...Option) *Config {
	cfg := &Config{
		driverCfg:       make(map[string]map[string]any),
		DatastoreBucket: datastore.DefaultBucket,
	}
	for _, apply := range opts {
		if apply == nil {
			continue
		}
		apply(cfg)
	}
	return cfg
}
// DriverConfig returns the driver-specific configuration registered for the
// given driver name (via OptionDriverConfig), or nil if none was set.
func (c *Config) DriverConfig(name string) map[string]any {
	return c.driverCfg[name]
}
// Option is an option setter function type used to pass various configurations
// to the controller.
type Option func(c *Config)
// OptionDefaultNetwork returns an option setter for the default network.
// Surrounding whitespace is trimmed from the given name.
func OptionDefaultNetwork(dn string) Option {
	trimmed := strings.TrimSpace(dn)
	return func(c *Config) {
		log.G(context.TODO()).Debugf("Option DefaultNetwork: %s", dn)
		c.DefaultNetwork = trimmed
	}
}
// OptionDefaultDriver returns an option setter for the default driver.
// Surrounding whitespace is trimmed from the given driver name.
func OptionDefaultDriver(dd string) Option {
	trimmed := strings.TrimSpace(dd)
	return func(c *Config) {
		log.G(context.TODO()).Debugf("Option DefaultDriver: %s", dd)
		c.DefaultDriver = trimmed
	}
}
// OptionDefaultAddressPoolConfig returns an option setter that installs the
// given networks as the default address pools.
func OptionDefaultAddressPoolConfig(addressPool []*ipamutils.NetworkToSplit) Option {
	return func(c *Config) {
		c.DefaultAddressPool = addressPool
	}
}
// OptionDriverConfig returns an option setter that records driver-specific
// configuration for the named network driver; it is later retrievable
// through Config.DriverConfig.
func OptionDriverConfig(networkType string, config map[string]any) Option {
	return func(c *Config) {
		c.driverCfg[networkType] = config
	}
}
// OptionLabels returns an option setter that appends the given labels to the
// configuration, keeping only labels that carry the netlabel prefix.
func OptionLabels(labels []string) Option {
	return func(c *Config) {
		for _, lbl := range labels {
			if !strings.HasPrefix(lbl, netlabel.Prefix) {
				continue
			}
			c.Labels = append(c.Labels, lbl)
		}
	}
}
// OptionDataDir returns an option setter for the data directory used by the
// controller's datastore.
func OptionDataDir(dataDir string) Option {
	return func(c *Config) {
		c.DataDir = dataDir
	}
}
// OptionExecRoot function returns an option setter for exec root folder.
//
// On Linux, it sets both the controller's ExecRoot and osl.basePath, whereas
// on FreeBSD, it only sets the controller's ExecRoot. It is a no-op on other
// platforms.
//
// The per-platform behavior lives in the platform-specific optionExecRoot
// implementations.
func OptionExecRoot(execRoot string) Option {
	return optionExecRoot(execRoot)
}
// OptionPluginGetter returns an option setter that installs the plugin getter
// used to resolve remote drivers.
func OptionPluginGetter(pg plugingetter.PluginGetter) Option {
	return func(c *Config) {
		c.PluginGetter = pg
	}
}
// OptionNetworkControlPlaneMTU returns an option setter for the control-plane
// MTU. Values below the warning threshold are logged, and values below the
// accepted minimum are clamped up to it.
func OptionNetworkControlPlaneMTU(exp int) Option {
	return func(c *Config) {
		log.G(context.TODO()).Debugf("Network Control Plane MTU: %d", exp)
		if exp >= warningThNetworkControlPlaneMTU {
			c.NetworkControlPlaneMTU = exp
			return
		}
		log.G(context.TODO()).Warnf("Received a MTU of %d, this value is very low, the network control plane can misbehave,"+
			" defaulting to minimum value (%d)", exp, minimumNetworkControlPlaneMTU)
		mtu := exp
		if mtu < minimumNetworkControlPlaneMTU {
			mtu = minimumNetworkControlPlaneMTU
		}
		c.NetworkControlPlaneMTU = mtu
	}
}
// OptionActiveSandboxes returns an option setter that records the sandboxes
// which were active during the previous daemon lifecycle, so they can be
// restored.
func OptionActiveSandboxes(sandboxes map[string]any) Option {
	return func(c *Config) {
		c.ActiveSandboxes = sandboxes
	}
}
// OptionFirewallBackend returns an option setter that selects the firewall
// backend.
func OptionFirewallBackend(val string) Option {
	return func(c *Config) {
		c.FirewallBackend = val
	}
}
// OptionRootless returns an option setter that records whether the daemon is
// running in rootless mode.
func OptionRootless(rootless bool) Option {
	return func(c *Config) {
		c.Rootless = rootless
	}
}
// OptionUserlandProxy returns an option setter that records whether the
// userland proxy is enabled, together with the path to the proxy binary.
func OptionUserlandProxy(enabled bool, proxyPath string) Option {
	return func(c *Config) {
		c.EnableUserlandProxy = enabled
		c.UserlandProxyPath = proxyPath
	}
}
package config
import "github.com/docker/docker/daemon/libnetwork/osl"
// optionExecRoot on Linux sets both the controller's ExecRoot and osl.basePath.
//
// Note that osl.SetBasePath mutates package-level state in the osl package,
// so this option has a process-wide side effect beyond the Config it is
// applied to.
func optionExecRoot(execRoot string) Option {
	return func(c *Config) {
		c.ExecRoot = execRoot
		osl.SetBasePath(execRoot)
	}
}
/*
Package libnetwork provides the basic functionality and extension points to
create network namespaces and allocate interfaces for containers to use.
networkType := "bridge"
// Create a new controller instance
driverOptions := options.Generic{}
genericOption := make(map[string]interface{})
genericOption[netlabel.GenericData] = driverOptions
controller, err := libnetwork.New(context.TODO(), config.OptionDriverConfig(networkType, genericOption))
if err != nil {
return
}
// Create a network for containers to join.
// NewNetwork accepts Variadic optional arguments that libnetwork and Drivers can make use of
network, err := controller.NewNetwork(context.TODO(), networkType, "network1", "")
if err != nil {
return
}
// For each new container: allocate IP and interfaces. The returned network
// settings will be used for container infos (inspect and such), as well as
// iptables rules for port publishing. This info is contained or accessible
// from the returned endpoint.
ep, err := network.CreateEndpoint(context.TODO(), "Endpoint1")
if err != nil {
return
}
// Create the sandbox for the container.
// NewSandbox accepts Variadic optional arguments which libnetwork can use.
sbx, err := controller.NewSandbox(context.TODO(), "container1",
libnetwork.OptionHostname("test"),
libnetwork.OptionDomainname("example.com"))
// A sandbox can join the endpoint via the join api.
err = ep.Join(sbx)
if err != nil {
return
}
*/
package libnetwork
import (
"context"
"fmt"
"net"
"path/filepath"
"runtime"
"strings"
"sync"
"time"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/cluster"
"github.com/docker/docker/daemon/libnetwork/config"
"github.com/docker/docker/daemon/libnetwork/datastore"
"github.com/docker/docker/daemon/libnetwork/diagnostic"
"github.com/docker/docker/daemon/libnetwork/discoverapi"
"github.com/docker/docker/daemon/libnetwork/driverapi"
remotedriver "github.com/docker/docker/daemon/libnetwork/drivers/remote"
"github.com/docker/docker/daemon/libnetwork/drvregistry"
"github.com/docker/docker/daemon/libnetwork/ipamapi"
"github.com/docker/docker/daemon/libnetwork/ipams"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/osl"
"github.com/docker/docker/daemon/libnetwork/scope"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/internal/otelutil"
"github.com/docker/docker/pkg/plugingetter"
"github.com/docker/docker/pkg/plugins"
"github.com/docker/docker/pkg/stringid"
"github.com/moby/locker"
"github.com/pkg/errors"
"go.opentelemetry.io/otel"
)
// NetworkWalker is a client provided function which will be used to walk the Networks.
// When the function returns true, the walk will stop.
type NetworkWalker func(nw *Network) bool
// Controller manages networks.
type Controller struct {
	// id is a random identifier generated at construction; see New.
	id              string
	drvRegistry     drvregistry.Networks
	ipamRegistry    drvregistry.IPAMs
	pmRegistry      drvregistry.PortMappers
	sandboxes       map[string]*Sandbox
	cfg             *config.Config
	store           *datastore.Store
	extKeyListener  net.Listener
	svcRecords      map[string]*svcInfo
	serviceBindings map[serviceKey]*service
	ingressSandbox  *Sandbox
	agent           *nwAgent
	// networkLocker serializes per-name/per-ID network creation; see NewNetwork.
	networkLocker *locker.Locker
	// agentInitDone/agentStopDone signal completion of agent init/stop;
	// they are created in agentOperationStart and closed in
	// agentInitComplete/agentStopComplete.
	agentInitDone    chan struct{}
	agentStopDone    chan struct{}
	keys             []*types.EncryptionKey
	diagnosticServer *diagnostic.Server
	// mu guards the mutable fields above (cfg, agent, keys, sandboxes, ...).
	mu sync.Mutex
	// networks is an in-memory cache of Network. Do not use this map unless
	// you're sure your code is thread-safe.
	//
	// The data persistence layer is instantiating new Network objects every
	// time it loads an object from its store or in-memory cache. This leads to
	// multiple instances representing the same network to concurrently live in
	// memory. As such, the Network mutex might be ineffective and not
	// correctly protect against data races.
	//
	// If you want to use this map for new or existing code, you need to make
	// sure: 1. the Network object is correctly locked; 2. the lock order
	// between Sandbox, Network and Endpoint is the same as the rest of the
	// code (in order to avoid deadlocks).
	networks map[string]*Network
	// networksMu protects the networks map.
	networksMu sync.Mutex
	// endpoints is an in-memory cache of Endpoint. Do not use this map unless
	// you're sure your code is thread-safe.
	//
	// The data persistence layer is instantiating new Endpoint objects every
	// time it loads an object from its store or in-memory cache. This leads to
	// multiple instances representing the same endpoint to concurrently live
	// in memory. As such, the Endpoint mutex might be ineffective and not
	// correctly protect against data races.
	//
	// If you want to use this map for new or existing code, you need to make
	// sure: 1. the Endpoint object is correctly locked; 2. the lock order
	// between Sandbox, Network and Endpoint is the same as the rest of the
	// code (in order to avoid deadlocks).
	endpoints map[string]*Endpoint
	// endpointsMu protects the endpoints map.
	endpointsMu sync.Mutex
	// FIXME(thaJeztah): defOsSbox is always nil on non-Linux: move these fields to Linux-only files.
	defOsSboxOnce sync.Once
	defOsSbox     *osl.Namespace
}
// New creates a new instance of network controller.
//
// It builds the configuration from the given options, opens the datastore,
// registers port mappers, network drivers and IPAM drivers, then restores
// state left over from a previous daemon lifecycle (special-driver networks,
// IPAM pools, live-restored sandboxes) before starting the external key
// listener.
func New(ctx context.Context, cfgOptions ...config.Option) (_ *Controller, retErr error) {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.New")
	defer func() {
		otelutil.RecordStatus(span, retErr)
		span.End()
	}()
	cfg := config.New(cfgOptions...)
	store, err := datastore.New(cfg.DataDir, cfg.DatastoreBucket)
	if err != nil {
		return nil, fmt.Errorf("libnet controller initialization: %w", err)
	}
	c := &Controller{
		id:               stringid.GenerateRandomID(),
		cfg:              cfg,
		store:            store,
		sandboxes:        map[string]*Sandbox{},
		networks:         map[string]*Network{},
		endpoints:        map[string]*Endpoint{},
		svcRecords:       make(map[string]*svcInfo),
		serviceBindings:  make(map[serviceKey]*service),
		agentInitDone:    make(chan struct{}),
		networkLocker:    locker.New(),
		diagnosticServer: diagnostic.New(),
	}
	if err := c.selectFirewallBackend(); err != nil {
		return nil, err
	}
	c.drvRegistry.Notify = c
	// Register portmappers before network drivers to make sure they can
	// restore existing sandboxes (with port mappings) during their
	// initialization, if the daemon is started in live restore mode.
	if err := registerPortMappers(ctx, &c.pmRegistry, c.cfg); err != nil {
		return nil, err
	}
	// External plugins don't need config passed through daemon. They can
	// bootstrap themselves.
	if err := remotedriver.Register(&c.drvRegistry, c.cfg.PluginGetter); err != nil {
		return nil, err
	}
	if err := registerNetworkDrivers(&c.drvRegistry, c.store, &c.pmRegistry, c.makeDriverConfig); err != nil {
		return nil, err
	}
	if err := ipams.Register(&c.ipamRegistry, c.cfg.PluginGetter, c.cfg.DefaultAddressPool, nil); err != nil {
		return nil, err
	}
	// Re-populate networks using special drivers with their driver, skipping
	// config-only networks (which have no driver state to populate).
	c.WalkNetworks(func(nw *Network) bool {
		if n := nw; n.hasSpecialDriver() && !n.ConfigOnly() {
			if err := n.getController().addNetwork(ctx, n); err != nil {
				log.G(ctx).Warnf("Failed to populate network %q with driver %q", nw.Name(), nw.Type())
			}
		}
		return false
	})
	// Reserve pools first before doing cleanup. Otherwise the
	// cleanups of endpoint/network and sandbox below will
	// generate many unnecessary warnings
	c.reservePools()
	if err := c.sandboxRestore(c.cfg.ActiveSandboxes); err != nil {
		log.G(ctx).WithError(err).Error("error during sandbox cleanup")
	}
	// Cleanup resources
	if err := c.cleanupLocalEndpoints(); err != nil {
		log.G(ctx).WithError(err).Warnf("error during endpoint cleanup")
	}
	c.networkCleanup()
	if err := c.startExternalKeyListener(); err != nil {
		return nil, err
	}
	c.setupUserChains()
	return c, nil
}
// SetClusterProvider sets the cluster provider.
//
// Setting a non-nil provider that differs from the current one spawns a
// goroutine (clusterAgentInit) listening for cluster events; setting the
// same provider again, or nil, does not.
func (c *Controller) SetClusterProvider(provider cluster.Provider) {
	var sameProvider bool
	c.mu.Lock()
	// Avoids to spawn multiple goroutine for the same cluster provider
	if c.cfg.ClusterProvider == provider {
		// If the cluster provider is already set, there is already a go routine spawned
		// that is listening for events, so nothing to do here
		sameProvider = true
	} else {
		c.cfg.ClusterProvider = provider
	}
	c.mu.Unlock()
	if provider == nil || sameProvider {
		return
	}
	// We don't want to spawn a new go routine if the previous one did not exit yet
	c.AgentStopWait()
	go c.clusterAgentInit()
}
// SetKeys configures the encryption key for gossip and overlay data path.
//
// Keys are validated first: each key must belong to a recognized subsystem,
// and each subsystem must carry exactly keyringSize keys. If the agent is not
// yet running the keys are stashed for later agent setup; otherwise the call
// is treated as a key rotation.
func (c *Controller) SetKeys(keys []*types.EncryptionKey) error {
	counts := map[string]int{}
	for _, k := range keys {
		switch k.Subsystem {
		case subsysGossip, subsysIPSec:
			counts[k.Subsystem]++
		default:
			return errors.New("key received for unrecognized subsystem")
		}
	}
	for subsys, n := range counts {
		if n != keyringSize {
			return fmt.Errorf("incorrect number of keys for subsystem %v", subsys)
		}
	}
	if c.getAgent() != nil {
		return c.handleKeyChange(keys)
	}
	c.mu.Lock()
	c.keys = keys
	c.mu.Unlock()
	return nil
}
// getAgent returns the controller's network agent, taking the controller
// lock for the read.
func (c *Controller) getAgent() *nwAgent {
	c.mu.Lock()
	defer c.mu.Unlock()
	return c.agent
}
// clusterAgentInit runs the cluster-event loop for the current cluster
// provider. It sets up the agent once keys are available and this node is a
// swarm node, and tears everything down (returning from the loop) when the
// node leaves the cluster.
func (c *Controller) clusterAgentInit() {
	clusterProvider := c.cfg.ClusterProvider
	var keysAvailable bool
	for {
		eventType := <-clusterProvider.ListenClusterEvents()
		// The events: EventSocketChange, EventNodeReady and EventNetworkKeysAvailable are not ordered
		// when all the condition for the agent initialization are met then proceed with it
		switch eventType {
		case cluster.EventNetworkKeysAvailable:
			// Validates that the keys are actually available before starting the initialization
			// This will handle old spurious messages left on the channel
			c.mu.Lock()
			keysAvailable = c.keys != nil
			c.mu.Unlock()
			// Deliberate fallthrough: key availability is also a trigger for
			// attempting agent setup, same as the events below.
			fallthrough
		case cluster.EventSocketChange, cluster.EventNodeReady:
			if keysAvailable && c.isSwarmNode() {
				c.agentOperationStart()
				if err := c.agentSetup(clusterProvider); err != nil {
					c.agentStopComplete()
				} else {
					c.agentInitComplete()
				}
			}
		case cluster.EventNodeLeave:
			c.agentOperationStart()
			c.mu.Lock()
			c.keys = nil
			c.mu.Unlock()
			// We are leaving the cluster. Make sure we
			// close the gossip so that we stop all
			// incoming gossip updates before cleaning up
			// any remaining service bindings. But before
			// deleting the networks since the networks
			// should still be present when cleaning up
			// service bindings
			c.agentClose()
			c.cleanupServiceDiscovery("")
			c.cleanupServiceBindings("")
			c.agentStopComplete()
			return
		}
	}
}
// AgentInitWait waits for agent initialization to be completed in the
// controller. It returns immediately if no initialization is in progress.
func (c *Controller) AgentInitWait() {
	c.mu.Lock()
	done := c.agentInitDone
	c.mu.Unlock()
	if done == nil {
		return
	}
	<-done
}
// AgentStopWait waits for the Agent stop to be completed in the controller.
// It returns immediately if no stop is in progress.
func (c *Controller) AgentStopWait() {
	c.mu.Lock()
	done := c.agentStopDone
	c.mu.Unlock()
	if done == nil {
		return
	}
	<-done
}
// agentOperationStart marks the start of an Agent Init or Agent Stop by
// lazily creating the completion channels waiters block on.
func (c *Controller) agentOperationStart() {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.agentInitDone == nil {
		c.agentInitDone = make(chan struct{})
	}
	if c.agentStopDone == nil {
		c.agentStopDone = make(chan struct{})
	}
}
// agentInitComplete notifies the successful completion of the Agent
// initialization by closing (and clearing) the init-done channel.
func (c *Controller) agentInitComplete() {
	c.mu.Lock()
	defer c.mu.Unlock()
	if done := c.agentInitDone; done != nil {
		close(done)
		c.agentInitDone = nil
	}
}
// agentStopComplete notifies the successful completion of the Agent stop by
// closing (and clearing) the stop-done channel.
func (c *Controller) agentStopComplete() {
	c.mu.Lock()
	defer c.mu.Unlock()
	if done := c.agentStopDone; done != nil {
		close(done)
		c.agentStopDone = nil
	}
}
// makeDriverConfig assembles the configuration map for the named driver type:
// matching daemon labels first, then any driver-specific config registered on
// the controller's Config (which takes precedence on key collisions).
func (c *Controller) makeDriverConfig(ntype string) map[string]interface{} {
	if c.cfg == nil {
		return nil
	}
	cfg := map[string]interface{}{}
	prefix := netlabel.DriverPrefix + "." + ntype
	for _, label := range c.cfg.Labels {
		key, val, _ := strings.Cut(label, "=")
		if strings.HasPrefix(key, prefix) {
			cfg[key] = val
		}
	}
	// Merge in the existing config for this driver.
	for k, v := range c.cfg.DriverConfig(ntype) {
		cfg[k] = v
	}
	return cfg
}
// ID returns the controller's unique identity (a random ID generated at
// construction).
func (c *Controller) ID() string {
	return c.id
}
// BuiltinDrivers returns the list of builtin network drivers.
func (c *Controller) BuiltinDrivers() []string {
	names := []string{}
	c.drvRegistry.WalkDrivers(func(name string, drv driverapi.Driver, _ driverapi.Capability) bool {
		if drv.IsBuiltIn() {
			names = append(names, name)
		}
		return false
	})
	return names
}
// BuiltinIPAMDrivers returns the list of builtin ipam drivers.
func (c *Controller) BuiltinIPAMDrivers() []string {
	names := []string{}
	c.ipamRegistry.WalkIPAMs(func(name string, drv ipamapi.Ipam, _ *ipamapi.Capability) bool {
		if drv.IsBuiltIn() {
			names = append(names, name)
		}
		return false
	})
	return names
}
// processNodeDiscovery notifies every registered driver that implements the
// discovery API about the given set of nodes being added or removed.
func (c *Controller) processNodeDiscovery(nodes []net.IP, add bool) {
	c.drvRegistry.WalkDrivers(func(name string, driver driverapi.Driver, capability driverapi.Capability) bool {
		if d, ok := driver.(discoverapi.Discover); ok {
			c.pushNodeDiscovery(d, capability, nodes, add)
		}
		return false
	})
}
// pushNodeDiscovery sends per-node discovery notifications (add or delete) to
// the given driver. Only drivers with global connectivity scope are notified;
// the node matching the agent's advertise address is flagged as Self.
func (c *Controller) pushNodeDiscovery(d discoverapi.Discover, capability driverapi.Capability, nodes []net.IP, add bool) {
	var advertiseIP net.IP
	// try swarm-mode config
	if agent := c.getAgent(); agent != nil {
		advertiseIP = net.ParseIP(agent.advertiseAddr)
	}
	if d == nil || capability.ConnectivityScope != scope.Global || nodes == nil {
		return
	}
	for _, node := range nodes {
		data := discoverapi.NodeDiscoveryData{Address: node.String(), Self: node.Equal(advertiseIP)}
		var err error
		if add {
			err = d.DiscoverNew(discoverapi.NodeDiscovery, data)
		} else {
			err = d.DiscoverDelete(discoverapi.NodeDiscovery, data)
		}
		if err != nil {
			log.G(context.TODO()).Debugf("discovery notification error: %v", err)
		}
	}
}
// Config returns the bootup configuration for the controller.
// Note that it returns a shallow copy of the config value, or a zero-value
// Config when the controller has no configuration.
func (c *Controller) Config() config.Config {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.cfg == nil {
		return config.Config{}
	}
	return *c.cfg
}
// isManager reports whether the cluster provider considers this node a
// manager; false when no provider is configured.
func (c *Controller) isManager() bool {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.cfg != nil && c.cfg.ClusterProvider != nil {
		return c.cfg.ClusterProvider.IsManager()
	}
	return false
}
// isAgent reports whether the cluster provider considers this node an agent
// (worker); false when no provider is configured.
func (c *Controller) isAgent() bool {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.cfg != nil && c.cfg.ClusterProvider != nil {
		return c.cfg.ClusterProvider.IsAgent()
	}
	return false
}
// isSwarmNode reports whether this node is part of a Swarm cluster, either
// as a manager or as an agent (worker).
func (c *Controller) isSwarmNode() bool {
	return c.isManager() || c.isAgent()
}
// GetPluginGetter returns the plugin getter from the controller's
// configuration, used to resolve remote drivers.
func (c *Controller) GetPluginGetter() plugingetter.PluginGetter {
	return c.cfg.PluginGetter
}
// RegisterDriver is invoked when a network driver is registered with the
// controller's driver registry (the controller is set as the registry's
// Notify callback). If the driver supports node discovery, the agent is
// notified about it. It never returns an error.
func (c *Controller) RegisterDriver(networkType string, driver driverapi.Driver, capability driverapi.Capability) error {
	if d, ok := driver.(discoverapi.Discover); ok {
		c.agentDriverNotify(d)
	}
	return nil
}
// XXX This should be made driver agnostic. See comment below.
// overlayDSROptionString is the overlay-driver generic option key that, when
// present, switches the network's load balancing mode to DSR (see NewNetwork).
const overlayDSROptionString = "dsr"
// NewNetwork creates a new network of the specified network type. The options
// are network specific and modeled in a generic way.
//
// The flow is: validate name/id uniqueness, build and validate the Network
// object, resolve its driver and apply config-network settings, allocate IPAM
// resources, create the network in the driver, persist it to the store, and
// finally join the agent cluster. Each stage installs a rollback defer that
// fires when a later stage fails (retErr != nil).
func (c *Controller) NewNetwork(ctx context.Context, networkType, name string, id string, options ...NetworkOption) (_ *Network, retErr error) {
	if id != "" {
		c.networkLocker.Lock(id)
		defer c.networkLocker.Unlock(id) //nolint:errcheck
		if _, err := c.NetworkByID(id); err == nil {
			return nil, NetworkNameError(id)
		}
	}
	if strings.TrimSpace(name) == "" {
		return nil, types.InvalidParameterErrorf("invalid name: name is empty")
	}
	// Make sure two concurrent calls to this method won't create conflicting
	// networks, otherwise libnetwork will end up in an invalid state.
	if name != "" {
		c.networkLocker.Lock(name)
		defer c.networkLocker.Unlock(name)
		if _, err := c.NetworkByName(name); err == nil {
			return nil, NetworkNameError(name)
		}
	}
	if id == "" {
		id = stringid.GenerateRandomID()
	}
	defaultIpam := defaultIpamForNetworkType(networkType)
	// Construct the network object
	nw := &Network{
		name:             name,
		networkType:      networkType,
		generic:          map[string]interface{}{netlabel.GenericData: make(map[string]string)},
		ipamType:         defaultIpam,
		enableIPv4:       true,
		id:               id,
		created:          time.Now(),
		ctrlr:            c,
		persist:          true,
		drvOnce:          &sync.Once{},
		loadBalancerMode: loadBalancerModeDefault,
	}
	nw.processOptions(options...)
	if err := nw.validateConfiguration(); err != nil {
		return nil, err
	}
	// These variables must be defined here, as declaration would otherwise
	// be skipped by the "goto addToStore"
	var (
		caps driverapi.Capability
		err  error
	)
	// Reset network types, force local scope and skip allocation and
	// plumbing for configuration networks. Reset of the config-only
	// network drivers is needed so that this special network is not
	// usable by old engine versions.
	if nw.configOnly {
		nw.scope = scope.Local
		nw.networkType = "null"
		goto addToStore
	}
	_, caps, err = nw.resolveDriver(nw.networkType, true)
	if err != nil {
		return nil, err
	}
	if nw.scope == scope.Local && caps.DataScope == scope.Global {
		return nil, types.ForbiddenErrorf("cannot downgrade network scope for %s networks", networkType)
	}
	if nw.ingress && caps.DataScope != scope.Global {
		return nil, types.ForbiddenErrorf("Ingress network can only be global scope network")
	}
	// From this point on, we need the network specific configuration,
	// which may come from a configuration-only network
	if nw.configFrom != "" {
		configNetwork, err := c.getConfigNetwork(nw.configFrom)
		if err != nil {
			return nil, types.NotFoundErrorf("configuration network %q does not exist", nw.configFrom)
		}
		if err := configNetwork.applyConfigurationTo(nw); err != nil {
			return nil, types.InternalErrorf("Failed to apply configuration: %v", err)
		}
	}
	// At this point the network scope is still unknown if not set by user
	if (caps.DataScope == scope.Global || nw.scope == scope.Swarm) &&
		c.isSwarmNode() && !nw.dynamic {
		if c.isManager() {
			if !nw.enableIPv4 {
				return nil, types.InvalidParameterErrorf("IPv4 cannot be disabled in a Swarm scoped network")
			}
			// For non-distributed controlled environment, globalscoped non-dynamic networks are redirected to Manager
			return nil, ManagerRedirectError(name)
		}
		return nil, types.ForbiddenErrorf("Cannot create a multi-host network from a worker node. Please create the network from a manager node.")
	}
	if nw.scope == scope.Swarm && !c.isSwarmNode() {
		return nil, types.ForbiddenErrorf("cannot create a swarm scoped network when swarm is not active")
	}
	// Make sure we have a driver available for this network type
	// before we allocate anything.
	if d, err := nw.driver(true); err != nil {
		return nil, err
	} else if gac, ok := d.(driverapi.GwAllocChecker); ok {
		// Give the driver a chance to say it doesn't need a gateway IP address.
		nw.skipGwAllocIPv4, nw.skipGwAllocIPv6, err = gac.GetSkipGwAlloc(nw.generic)
		if err != nil {
			return nil, err
		}
	}
	if err := nw.ipamAllocate(); err != nil {
		return nil, err
	}
	defer func() {
		if retErr != nil {
			nw.ipamRelease()
		}
	}()
	// Note from thaJeztah to future code visitors, or "future self".
	//
	// This code was previously assigning the error to the global "err"
	// variable (before it was renamed to "retErr"), but in case of a
	// "MaskableError" did not *return* the error:
	// https://github.com/moby/moby/blob/b325dcbff60a04cedbe40eb627465fc7379d05bf/libnetwork/controller.go#L566-L573
	//
	// Depending on code paths further down, that meant that this error
	// was either overwritten by other errors (and thus not handled in
	// defer statements) or handled (if no other code was overwriting it.
	//
	// I suspect this was a bug (but possible without effect), but it could
	// have been intentional. This logic is confusing at least, and even
	// more so combined with the handling in defer statements that check for
	// both the "err" return AND "skipCfgEpCount":
	// https://github.com/moby/moby/blob/b325dcbff60a04cedbe40eb627465fc7379d05bf/libnetwork/controller.go#L586-L602
	//
	// To save future visitors some time to dig up history:
	//
	// - config-only networks were added in 25082206df465d1c11dd1276a65b4a1dc701bd43
	// - the special error-handling and "skipCfgEpCount" was added in ddd22a819867faa0cd7d12b0c3fad1099ac3eb26
	// - and updated in 87b082f3659f9ec245ab15d781e6bfffced0af83 to not use string-matching
	//
	// To cut a long story short: if this broke anything, you know who to blame :)
	if err := c.addNetwork(ctx, nw); err != nil {
		if _, ok := err.(types.MaskableError); !ok {
			return nil, err
		}
	}
	defer func() {
		if retErr != nil {
			if err := nw.deleteNetwork(); err != nil {
				log.G(ctx).Warnf("couldn't roll back driver network on network %s creation failure: %v", nw.name, retErr)
			}
		}
	}()
	// XXX If the driver type is "overlay" check the options for DSR
	// being set. If so, set the network's load balancing mode to DSR.
	// This should really be done in a network option, but due to
	// time pressure to get this in without adding changes to moby,
	// swarm and CLI, it is being implemented as a driver-specific
	// option. Unfortunately, drivers can't influence the core
	// "libnetwork.Network" data type. Hence we need this hack code
	// to implement in this manner.
	if gval, ok := nw.generic[netlabel.GenericData]; ok && nw.networkType == "overlay" {
		optMap := gval.(map[string]string)
		if _, ok := optMap[overlayDSROptionString]; ok {
			nw.loadBalancerMode = loadBalancerModeDSR
		}
	}
addToStore:
	// First store the endpoint count, then the network. To avoid to
	// end up with a datastore containing a network and not an epCnt,
	// in case of an ungraceful shutdown during this function call.
	//
	// TODO(robmry) - remove this once downgrade past 28.1.0 is no longer supported.
	// The endpoint count is no longer used, it's created in the store to make
	// downgrade work, versions older than 28.1.0 expect to read it and error if they
	// can't. The stored count is not maintained, so the downgraded version will
	// always find it's zero (which is usually correct because the daemon had
	// stopped), but older daemons fix it on startup anyway.
	epCnt := &endpointCnt{n: nw}
	if err := c.updateToStore(ctx, epCnt); err != nil {
		return nil, err
	}
	defer func() {
		if retErr != nil {
			if err := c.deleteFromStore(epCnt); err != nil {
				log.G(ctx).Warnf("could not rollback from store, epCnt %v on failure (%v): %v", epCnt, retErr, err)
			}
		}
	}()
	if err := c.storeNetwork(ctx, nw); err != nil {
		return nil, err
	}
	defer func() {
		if retErr != nil {
			if err := c.deleteStoredNetwork(nw); err != nil {
				log.G(ctx).Warnf("could not rollback from store, network %v on failure (%v): %v", nw, retErr, err)
			}
		}
	}()
	// Config-only networks carry no driver or cluster state: done.
	if nw.configOnly {
		return nw, nil
	}
	joinCluster(nw)
	defer func() {
		if retErr != nil {
			nw.cancelDriverWatches()
			if err := nw.leaveCluster(); err != nil {
				log.G(ctx).Warnf("Failed to leave agent cluster on network %s on failure (%v): %v", nw.name, retErr, err)
			}
		}
	}()
	if nw.hasLoadBalancerEndpoint() {
		if err := nw.createLoadBalancerSandbox(); err != nil {
			return nil, err
		}
	}
	return nw, nil
}
// joinCluster is a NetworkWalker that joins the given network into the agent
// cluster and installs its driver watches. Config-only networks are skipped.
// It always returns false so a walk visits every network.
var joinCluster NetworkWalker = func(nw *Network) bool {
	if nw.configOnly {
		return false
	}
	if err := nw.joinCluster(); err != nil {
		log.G(context.TODO()).Errorf("Failed to join network %s (%s) into agent cluster: %v", nw.Name(), nw.ID(), err)
	}
	nw.addDriverWatches()
	return false
}
// reservePools re-reserves IPAM pools and endpoint addresses for the networks
// loaded from the local store, so that live-restored resources are accounted
// for before cleanup runs. It only acts on networks whose IPAM driver
// requires request replay.
func (c *Controller) reservePools() {
	networks, err := c.getNetworks()
	if err != nil {
		log.G(context.TODO()).Warnf("Could not retrieve networks from local store during ipam allocation for existing networks: %v", err)
		return
	}
	for _, n := range networks {
		if n.configOnly {
			continue
		}
		if !doReplayPoolReserve(n) {
			continue
		}
		// Construct pseudo configs for the auto IP case
		autoIPv4 := (len(n.ipamV4Config) == 0 || (len(n.ipamV4Config) == 1 && n.ipamV4Config[0].PreferredPool == "")) && len(n.ipamV4Info) > 0
		autoIPv6 := (len(n.ipamV6Config) == 0 || (len(n.ipamV6Config) == 1 && n.ipamV6Config[0].PreferredPool == "")) && len(n.ipamV6Info) > 0
		if n.enableIPv4 && autoIPv4 {
			n.ipamV4Config = []*IpamConf{{PreferredPool: n.ipamV4Info[0].Pool.String()}}
		}
		if n.enableIPv6 && autoIPv6 {
			n.ipamV6Config = []*IpamConf{{PreferredPool: n.ipamV6Info[0].Pool.String()}}
		}
		// Account current network gateways
		if n.enableIPv4 {
			for i, cfg := range n.ipamV4Config {
				if cfg.Gateway == "" && n.ipamV4Info[i].Gateway != nil {
					cfg.Gateway = n.ipamV4Info[i].Gateway.IP.String()
				}
			}
		}
		if n.enableIPv6 {
			for i, cfg := range n.ipamV6Config {
				if cfg.Gateway == "" && n.ipamV6Info[i].Gateway != nil {
					cfg.Gateway = n.ipamV6Info[i].Gateway.IP.String()
				}
			}
		}
		// Reserve pools
		if err := n.ipamAllocate(); err != nil {
			log.G(context.TODO()).Warnf("Failed to allocate ipam pool(s) for network %q (%s): %v", n.Name(), n.ID(), err)
		}
		// Reserve existing endpoints' addresses
		ipam, _, err := n.getController().getIPAMDriver(n.ipamType)
		if err != nil {
			log.G(context.TODO()).Warnf("Failed to retrieve ipam driver for network %q (%s) during address reservation", n.Name(), n.ID())
			continue
		}
		epl, err := n.getEndpointsFromStore()
		if err != nil {
			log.G(context.TODO()).Warnf("Failed to retrieve list of current endpoints on network %q (%s)", n.Name(), n.ID())
			continue
		}
		for _, ep := range epl {
			if ep.Iface() == nil {
				log.G(context.TODO()).Warnf("endpoint interface is empty for %q (%s)", ep.Name(), ep.ID())
				continue
			}
			if err := ep.assignAddress(ipam, ep.Iface().Address() != nil, ep.Iface().AddressIPv6() != nil); err != nil {
				log.G(context.TODO()).Warnf("Failed to reserve current address for endpoint %q (%s) on network %q (%s)",
					ep.Name(), ep.ID(), n.Name(), n.ID())
			}
		}
	}
}
// doReplayPoolReserve reports whether the network's IPAM driver requires its
// allocation requests to be replayed on restore; false (with a warning) when
// the driver cannot be resolved.
func doReplayPoolReserve(n *Network) bool {
	_, ipamCaps, err := n.getController().getIPAMDriver(n.ipamType)
	if err != nil {
		log.G(context.TODO()).Warnf("Failed to retrieve ipam driver for network %q (%s): %v", n.Name(), n.ID(), err)
		return false
	}
	return ipamCaps.RequiresRequestReplay
}
// addNetwork asks the network's driver to create the network, then starts
// the network's embedded DNS resolver.
func (c *Controller) addNetwork(ctx context.Context, n *Network) error {
	drv, err := n.driver(true)
	if err != nil {
		return err
	}
	// Create the network in the driver.
	if err := drv.CreateNetwork(ctx, n.id, n.generic, n, n.getIPData(4), n.getIPData(6)); err != nil {
		return err
	}
	n.startResolver()
	return nil
}
// Networks returns the list of Network(s) managed by this controller,
// excluding networks that are in the process of being deleted.
func (c *Controller) Networks(ctx context.Context) []*Network {
	var out []*Network
	for _, nw := range c.getNetworksFromStore(ctx) {
		if !nw.inDelete {
			out = append(out, nw)
		}
	}
	return out
}
// WalkNetworks uses the provided function to walk the Network(s) managed by this
// controller. Walking stops as soon as the walker returns true.
func (c *Controller) WalkNetworks(walker NetworkWalker) {
	networks := c.Networks(context.TODO())
	for _, nw := range networks {
		if stop := walker(nw); stop {
			return
		}
	}
}
// NetworkByName returns the Network which has the passed name.
// If not found, the error [ErrNoSuchNetwork] is returned.
func (c *Controller) NetworkByName(name string) (*Network, error) {
	if name == "" {
		return nil, types.InvalidParameterErrorf("invalid name: name is empty")
	}
	// Walk the networks and capture the first name match.
	var found *Network
	c.WalkNetworks(func(nw *Network) bool {
		if nw.Name() != name {
			return false
		}
		found = nw
		return true
	})
	if found == nil {
		return nil, ErrNoSuchNetwork(name)
	}
	return found, nil
}
// NetworkByID returns the Network which has the passed id.
// If not found, the error [ErrNoSuchNetwork] is returned.
func (c *Controller) NetworkByID(id string) (*Network, error) {
	if id == "" {
		return nil, types.InvalidParameterErrorf("invalid id: id is empty")
	}
	// ID lookups go straight to the store-backed view; no need to walk the
	// in-memory list as NetworkByName does.
	return c.getNetworkFromStore(id)
}
// NewSandbox creates a new sandbox for containerID.
//
// If a stub sandbox for this container was restored from the store, it is
// reused (so its persisted endpoints are not lost) and un-stubbed instead of
// allocating a fresh one. On any later failure, the deferred handlers roll
// back the partially-registered state (ingress slot, sandboxes map entry).
func (c *Controller) NewSandbox(ctx context.Context, containerID string, options ...SandboxOption) (_ *Sandbox, retErr error) {
	if containerID == "" {
		return nil, types.InvalidParameterErrorf("invalid container ID")
	}

	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.Controller.NewSandbox")
	defer span.End()

	var sb *Sandbox
	c.mu.Lock()
	for _, s := range c.sandboxes {
		if s.containerID == containerID {
			// If not a stub, then we already have a complete sandbox.
			if !s.isStub {
				sbID := s.ID()
				c.mu.Unlock()
				return nil, types.ForbiddenErrorf("container %s is already present in sandbox %s", containerID, sbID)
			}

			// We already have a stub sandbox from the
			// store. Make use of it so that we don't lose
			// the endpoints from store but reset the
			// isStub flag.
			sb = s
			sb.isStub = false
			break
		}
	}
	c.mu.Unlock()

	// Create sandbox and process options first. Key generation depends on an option
	if sb == nil {
		// TODO(thaJeztah): given that a "containerID" must be unique in the list of sandboxes, is there any reason we're not using containerID as sandbox ID on non-Windows?
		sandboxID := containerID
		if runtime.GOOS != "windows" {
			sandboxID = stringid.GenerateRandomID()
		}
		sb = &Sandbox{
			id:                 sandboxID,
			containerID:        containerID,
			endpoints:          []*Endpoint{},
			epPriority:         map[string]int{},
			populatedEndpoints: map[string]struct{}{},
			config:             containerConfig{},
			controller:         c,
			extDNS:             []extDNSEntry{},
		}
	}

	sb.processOptions(options...)

	c.mu.Lock()
	// Only one ingress sandbox may exist per controller.
	if sb.ingress && c.ingressSandbox != nil {
		c.mu.Unlock()
		return nil, types.ForbiddenErrorf("ingress sandbox already present")
	}

	if sb.ingress {
		// The ingress sandbox uses a fixed, well-known ID and resolution-file paths.
		c.ingressSandbox = sb
		sb.config.hostsPath = filepath.Join(c.cfg.DataDir, "hosts")
		sb.config.resolvConfPath = filepath.Join(c.cfg.DataDir, "resolv.conf")
		sb.id = "ingress_sbox"
	} else if sb.loadBalancerNID != "" {
		sb.id = "lb_" + sb.loadBalancerNID
	}
	c.mu.Unlock()

	// Roll back the ingress registration if anything below fails.
	defer func() {
		if retErr != nil {
			c.mu.Lock()
			if sb.ingress {
				c.ingressSandbox = nil
			}
			c.mu.Unlock()
		}
	}()

	if err := sb.setupResolutionFiles(ctx); err != nil {
		return nil, err
	}
	if err := c.setupOSLSandbox(sb); err != nil {
		return nil, err
	}

	c.mu.Lock()
	c.sandboxes[sb.id] = sb
	c.mu.Unlock()
	// Deregister the sandbox again if persisting it fails.
	defer func() {
		if retErr != nil {
			c.mu.Lock()
			delete(c.sandboxes, sb.id)
			c.mu.Unlock()
		}
	}()

	if err := sb.storeUpdate(ctx); err != nil {
		return nil, fmt.Errorf("failed to update the store state of sandbox: %v", err)
	}

	return sb, nil
}
// GetSandbox returns the Sandbox which has the passed id.
//
// It returns an [ErrInvalidID] when passing an invalid ID, or an
// [types.NotFoundError] if no Sandbox was found for the container.
func (c *Controller) GetSandbox(containerID string) (*Sandbox, error) {
	if containerID == "" {
		return nil, types.InvalidParameterErrorf("invalid id: id is empty")
	}

	c.mu.Lock()
	defer c.mu.Unlock()
	if runtime.GOOS == "windows" {
		// fast-path for Windows, which uses the container ID as sandbox ID.
		if sb := c.sandboxes[containerID]; sb != nil && !sb.isStub {
			return sb, nil
		}
		return nil, types.NotFoundErrorf("network sandbox for container %s not found", containerID)
	}
	// On other platforms sandbox IDs are random, so scan for the container ID.
	for _, sb := range c.sandboxes {
		if sb.containerID == containerID && !sb.isStub {
			return sb, nil
		}
	}
	return nil, types.NotFoundErrorf("network sandbox for container %s not found", containerID)
}
// SandboxByID returns the Sandbox which has the passed id.
// If not found, a [types.NotFoundError] is returned.
func (c *Controller) SandboxByID(id string) (*Sandbox, error) {
	if id == "" {
		return nil, types.InvalidParameterErrorf("invalid id: id is empty")
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	if sb, ok := c.sandboxes[id]; ok {
		return sb, nil
	}
	return nil, types.NotFoundErrorf("sandbox %s not found", id)
}
// SandboxDestroy destroys a sandbox given a container ID.
// It is a no-op if no sandbox exists for the container.
func (c *Controller) SandboxDestroy(ctx context.Context, id string) error {
	var target *Sandbox
	c.mu.Lock()
	for _, s := range c.sandboxes {
		if s.containerID == id {
			target = s
			break
		}
	}
	c.mu.Unlock()

	// It is not an error if sandbox is not available
	if target == nil {
		return nil
	}
	return target.Delete(ctx)
}
// loadDriver resolves the network driver plugin for networkType, preferring
// the controller's plugin getter when one is configured. A missing plugin is
// reported as a [types.NotFoundError].
func (c *Controller) loadDriver(networkType string) error {
	var err error
	pg := c.GetPluginGetter()
	if pg == nil {
		_, err = plugins.Get(networkType, driverapi.NetworkPluginEndpointType)
	} else {
		_, err = pg.Get(networkType, driverapi.NetworkPluginEndpointType, plugingetter.Lookup)
	}
	if err == nil {
		return nil
	}
	if errors.Is(err, plugins.ErrNotFound) {
		return types.NotFoundErrorf("%v", err)
	}
	return err
}
// loadIPAMDriver resolves the IPAM driver plugin with the given name,
// preferring the controller's plugin getter when one is configured.
// A missing plugin is reported as a [types.NotFoundError].
func (c *Controller) loadIPAMDriver(name string) error {
	var err error
	pg := c.GetPluginGetter()
	if pg == nil {
		_, err = plugins.Get(name, ipamapi.PluginEndpointType)
	} else {
		_, err = pg.Get(name, ipamapi.PluginEndpointType, plugingetter.Lookup)
	}
	if err == nil {
		return nil
	}
	if errors.Is(err, plugins.ErrNotFound) {
		return types.NotFoundErrorf("%v", err)
	}
	return err
}
// getIPAMDriver returns the registered IPAM driver and its capabilities for
// name. If the driver is not registered yet, name is treated as a plugin
// name: the plugin is loaded and the registry lookup retried once.
func (c *Controller) getIPAMDriver(name string) (ipamapi.Ipam, *ipamapi.Capability, error) {
	if d, caps := c.ipamRegistry.IPAM(name); d != nil {
		return d, caps, nil
	}
	// Might be a plugin name. Try loading it
	if err := c.loadIPAMDriver(name); err != nil {
		return nil, nil, err
	}
	// Now that we resolved the plugin, try again looking up the registry
	d, caps := c.ipamRegistry.IPAM(name)
	if d == nil {
		return nil, nil, types.InvalidParameterErrorf("invalid ipam driver: %q", name)
	}
	return d, caps, nil
}
// Stop stops the network controller.
func (c *Controller) Stop() {
	// Close the persistent store, then stop listening for external
	// sandbox-key notifications.
	c.store.Close()
	c.stopExternalKeyListener()
}

// StartDiagnostic starts the network diagnostic server listening on port.
// The server binds to the loopback interface only.
func (c *Controller) StartDiagnostic(port int) {
	c.diagnosticServer.Enable("127.0.0.1", port)
}

// StopDiagnostic stops the network diagnostic server.
func (c *Controller) StopDiagnostic() {
	c.diagnosticServer.Shutdown()
}

// IsDiagnosticEnabled returns true if the diagnostic server is running.
func (c *Controller) IsDiagnosticEnabled() bool {
	return c.diagnosticServer.Enabled()
}
package libnetwork
import (
"context"
"fmt"
"sync"
"time"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/internal/nftables"
"github.com/docker/docker/daemon/libnetwork/iptables"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/options"
"github.com/docker/docker/daemon/libnetwork/osl"
"github.com/moby/moby/api/types/system"
)
// FirewallBackend returns the name of the firewall backend for "docker info".
// The driver name is "iptables" or "nftables", with a "+firewalld" suffix when
// firewalld is in use; the firewalld reload time is reported when available.
func (c *Controller) FirewallBackend() *system.FirewallInfo {
	driver := "iptables"
	if nftables.Enabled() {
		driver = "nftables"
	}
	info := system.FirewallInfo{Driver: driver}
	if iptables.UsingFirewalld() {
		info.Driver += "+firewalld"
		reloadedAt := iptables.FirewalldReloadedAt()
		if !reloadedAt.IsZero() {
			info.Info = [][2]string{{"ReloadedAt", reloadedAt.Format(time.RFC3339)}}
		}
	}
	return &info
}
// enabledIptablesVersions returns the iptables versions that are enabled
// for the controller.
func (c *Controller) enabledIptablesVersions() []iptables.IPVersion {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.cfg == nil {
		return nil
	}
	// Parse cfg["bridge"]["generic"]["EnableIPTables"] / ["EnableIP6Tables"].
	genericCfg, ok := c.cfg.DriverConfig("bridge")[netlabel.GenericData].(options.Generic)
	if !ok {
		return nil
	}
	var versions []iptables.IPVersion
	// iptables is enabled unless the user explicitly disabled it.
	if enabled, ok := genericCfg["EnableIPTables"].(bool); enabled || !ok {
		versions = append(versions, iptables.IPv4)
	}
	// ip6tables is disabled unless the user explicitly enabled it.
	if enabled, _ := genericCfg["EnableIP6Tables"].(bool); enabled {
		versions = append(versions, iptables.IPv6)
	}
	return versions
}
// getDefaultOSLSandbox returns the controller's default [osl.Sandbox]. It
// creates the sandbox if it does not yet exist.
func (c *Controller) getDefaultOSLSandbox(key string) (*osl.Namespace, error) {
	var err error
	c.defOsSboxOnce.Do(func() {
		c.defOsSbox, err = osl.NewSandbox(key, false, false)
	})

	if err != nil {
		// Reset the Once so a later call can retry the creation.
		// NOTE(review): replacing a sync.Once that concurrent callers may be
		// blocked on is not synchronized — confirm callers are serialized
		// before relying on this retry path.
		c.defOsSboxOnce = sync.Once{}
		return nil, fmt.Errorf("failed to create default sandbox: %v", err)
	}

	return c.defOsSbox, nil
}
// setupOSLSandbox sets the sandbox [osl.Sandbox], and applies operating-
// specific configuration.
//
// Depending on the Sandbox settings, it may either use the Controller's
// default sandbox, or configure a new one.
func (c *Controller) setupOSLSandbox(sb *Sandbox) error {
	if sb.config.useDefaultSandBox {
		defSB, err := c.getDefaultOSLSandbox(sb.Key())
		if err != nil {
			return err
		}
		sb.osSbox = defSB
	}

	// No default sandbox assigned and the key is managed internally:
	// create a dedicated OS sandbox for this Sandbox.
	if sb.osSbox == nil && !sb.config.useExternalKey {
		newSB, err := osl.NewSandbox(sb.Key(), !sb.config.useDefaultSandBox, false)
		if err != nil {
			return fmt.Errorf("failed to create new osl sandbox: %v", err)
		}
		sb.osSbox = newSB
	}

	if sb.osSbox != nil {
		// Apply operating specific knobs on the load balancer sandbox
		err := sb.osSbox.InvokeFunc(func() {
			sb.osSbox.ApplyOSTweaks(sb.oslTypes)
		})
		if err != nil {
			// Non-fatal: tuning failures are logged but do not abort setup.
			log.G(context.TODO()).Errorf("Failed to apply performance tuning sysctls to the sandbox: %v", err)
		}
		// Keep this just so performance is not changed
		sb.osSbox.ApplyOSTweaks(sb.oslTypes)
	}
	return nil
}
package datastore
import (
"errors"
"fmt"
"sync"
store "github.com/docker/docker/daemon/libnetwork/internal/kvstore"
)
// kvMap maps a fully-qualified store key to its cached object.
type kvMap map[string]KVObject

// cache is a mutex-guarded, in-memory view of a backing store,
// partitioned by key prefix and populated lazily (see kmap).
type cache struct {
	mu  sync.Mutex
	kmm map[string]kvMap // key prefix -> objects stored under that prefix
	ds  store.Store      // backing store used to populate cache misses
}
// newCache returns an empty cache backed by ds.
func newCache(ds store.Store) *cache {
	c := &cache{
		kmm: map[string]kvMap{},
		ds:  ds,
	}
	return c
}
// kmap returns the in-memory map holding all objects that share kvObject's
// key prefix, populating it from the backing store on first access.
//
// Several goroutines may race to populate the same prefix; the first to
// publish its map into c.kmm wins, and the losers discard their copy and
// adopt the published one.
func (c *cache) kmap(kvObject KVObject) (kvMap, error) {
	var err error

	c.mu.Lock()
	keyPrefix := Key(kvObject.KeyPrefix()...)
	kmap, ok := c.kmm[keyPrefix]
	c.mu.Unlock()

	if ok {
		return kmap, nil
	}

	kmap = kvMap{}
	// The store is listed without holding the lock; the publish race is
	// resolved at the "out" label below.
	kvList, err := c.ds.List(keyPrefix)
	if err != nil {
		if errors.Is(err, store.ErrKeyNotFound) {
			// If the store doesn't have anything then there is nothing to
			// populate in the cache. Just bail out.
			goto out
		}

		return nil, fmt.Errorf("error while populating kmap: %v", err)
	}

	for _, kvPair := range kvList {
		// Ignore empty kvPair values
		if len(kvPair.Value) == 0 {
			continue
		}

		dstO := kvObject.New()
		err = dstO.SetValue(kvPair.Value)
		if err != nil {
			return nil, err
		}

		// Make sure the object has a correct view of the DB index in
		// case we need to modify it and update the DB.
		dstO.SetIndex(kvPair.LastIndex)

		kmap[Key(dstO.Key()...)] = dstO
	}

out:
	// There may multiple go routines racing to fill the
	// cache. The one which places the kmap in c.kmm first
	// wins. The others should just use what the first populated.
	c.mu.Lock()
	kmapNew, ok := c.kmm[keyPrefix]
	if ok {
		c.mu.Unlock()
		return kmapNew, nil
	}

	c.kmm[keyPrefix] = kmap
	c.mu.Unlock()

	return kmap, nil
}
// add inserts or replaces kvObject in the cache. When atomic is true the
// cache enforces its own optimistic lock: a cached entry whose index differs
// from kvObject's yields ErrKeyModified, and on success kvObject's index is
// bumped by one.
func (c *cache) add(kvObject KVObject, atomic bool) error {
	kmap, err := c.kmap(kvObject)
	if err != nil {
		return err
	}

	key := Key(kvObject.Key()...)
	c.mu.Lock()
	defer c.mu.Unlock()
	if atomic {
		if prev, ok := kmap[key]; ok && prev.Index() != kvObject.Index() {
			return ErrKeyModified
		}
		// Increment index
		kvObject.SetIndex(kvObject.Index() + 1)
	}
	kmap[key] = kvObject
	return nil
}
// del removes kvObject from the cache. When atomic is true the cached
// entry's index must match kvObject's, otherwise ErrKeyModified is returned.
func (c *cache) del(kvObject KVObject, atomic bool) error {
	kmap, err := c.kmap(kvObject)
	if err != nil {
		return err
	}

	key := Key(kvObject.Key()...)
	c.mu.Lock()
	defer c.mu.Unlock()
	if atomic {
		if prev, ok := kmap[key]; ok && prev.Index() != kvObject.Index() {
			return ErrKeyModified
		}
	}
	delete(kmap, key)
	return nil
}
// get copies the cached object with kvObject's key into kvObject.
// ErrKeyNotFound is returned when no such entry is cached.
func (c *cache) get(kvObject KVObject) error {
	kmap, err := c.kmap(kvObject)
	if err != nil {
		return err
	}

	c.mu.Lock()
	defer c.mu.Unlock()
	if o, ok := kmap[Key(kvObject.Key()...)]; ok {
		return o.CopyTo(kvObject)
	}
	return ErrKeyNotFound
}
// list returns every cached object sharing kvObject's key prefix.
// The result order is unspecified (map iteration); a nil slice is
// returned when the prefix holds no objects.
func (c *cache) list(kvObject KVObject) ([]KVObject, error) {
	kmap, err := c.kmap(kvObject)
	if err != nil {
		return nil, err
	}

	c.mu.Lock()
	defer c.mu.Unlock()
	var objs []KVObject
	for _, o := range kmap {
		objs = append(objs, o)
	}
	return objs, nil
}
package datastore
import (
"errors"
"path"
"strings"
"sync"
store "github.com/docker/docker/daemon/libnetwork/internal/kvstore"
"github.com/docker/docker/daemon/libnetwork/internal/kvstore/boltdb"
"github.com/docker/docker/daemon/libnetwork/types"
)
var (
	// ErrKeyModified is raised for an atomic update when the update is working on a stale state.
	ErrKeyModified = store.ErrKeyModified
	// ErrKeyNotFound is returned when the queried key does not exist in the store.
	ErrKeyNotFound = store.ErrKeyNotFound
)

// Store is a mutex-guarded datastore front-end that pairs a persistent
// KV store with an in-memory cache of its contents.
type Store struct {
	mu    sync.Mutex
	store store.Store // persistent backend
	cache *cache      // lazily-populated in-memory view of the backend
}
// KVObject is Key/Value interface used by objects to be part of the Store.
type KVObject interface {
	// Key method lets an object provide the Key to be used in KV Store.
	Key() []string
	// KeyPrefix method lets an object return immediate parent key that can be used for tree walk.
	KeyPrefix() []string
	// Value method lets an object marshal its content to be stored in the KV store.
	Value() []byte
	// SetValue is used by the datastore to set the object's value when loaded from the data store.
	SetValue([]byte) error
	// Index method returns the latest DB Index as seen by the object.
	Index() uint64
	// SetIndex method allows the datastore to store the latest DB Index into the object.
	SetIndex(uint64)
	// Exists returns true if the object exists in the datastore, false if it hasn't been stored yet.
	// When SetIndex() is called, the object has been stored.
	Exists() bool
	// Skip provides a way for a KV Object to avoid persisting it in the KV Store.
	Skip() bool
	// New returns a new object which is created based on the
	// source object.
	New() KVObject
	// CopyTo deep copies the contents of the implementing object
	// to the passed destination object.
	CopyTo(KVObject) error
}
const (
	// NetworkKeyPrefix is the prefix for network key in the kv store
	NetworkKeyPrefix = "network"
	// EndpointKeyPrefix is the prefix for endpoint key in the kv store
	EndpointKeyPrefix = "endpoint"
)

var (
	// defaultRootChain is the default path prefix prepended to every key; see Key().
	defaultRootChain = []string{"docker", "network", "v1.0"}
	// rootChain is the active key prefix used by Key().
	rootChain = defaultRootChain
)

// DefaultBucket is the default bucket name used when creating a Store.
const DefaultBucket = "libnetwork"
// Key builds a fully-qualified store key: each element of the root chain,
// then each of the given elements, each followed by a "/" separator.
func Key(key ...string) string {
	var b strings.Builder
	for _, part := range rootChain {
		b.WriteString(part)
		b.WriteString("/")
	}
	for _, part := range key {
		b.WriteString(part)
		b.WriteString("/")
	}
	return b.String()
}
// New creates a new Store instance backed by a boltdb database named
// "local-kv.db" inside dir, using the given bucket. Both dir and bucket
// must be non-empty.
func New(dir, bucket string) (*Store, error) {
	switch {
	case dir == "":
		return nil, errors.New("empty dir")
	case bucket == "":
		return nil, errors.New("empty bucket")
	}

	s, err := boltdb.New(path.Join(dir, "local-kv.db"), bucket)
	if err != nil {
		return nil, err
	}
	return &Store{
		store: s,
		cache: newCache(s),
	}, nil
}
// Close closes the data store. Only the persistent backend is closed; the
// in-memory cache is left as-is.
func (ds *Store) Close() {
	ds.store.Close()
}
// PutObjectAtomic provides an atomic add and update operation for a Record.
// It returns ErrKeyModified when the object's index is stale, i.e. the key
// was modified concurrently. Objects flagged Skip() bypass the persistent
// store and are sequenced in the cache instead.
func (ds *Store) PutObjectAtomic(kvObject KVObject) error {
	ds.mu.Lock()
	defer ds.mu.Unlock()

	if kvObject == nil {
		return types.InvalidParameterErrorf("invalid KV Object: nil")
	}
	value := kvObject.Value()
	if value == nil {
		return types.InvalidParameterErrorf("invalid KV Object with a nil Value for key %s", Key(kvObject.Key()...))
	}

	if !kvObject.Skip() {
		key := Key(kvObject.Key()...)
		// An existing object carries its last-seen index as the
		// optimistic-lock precondition for the put.
		var previous *store.KVPair
		if kvObject.Exists() {
			previous = &store.KVPair{Key: key, LastIndex: kvObject.Index()}
		}

		pair, err := ds.store.AtomicPut(key, value, previous)
		switch {
		case errors.Is(err, store.ErrKeyExists):
			return ErrKeyModified
		case err != nil:
			return err
		}
		kvObject.SetIndex(pair.LastIndex)
	}

	// If persistent store is skipped, sequencing needs to
	// happen in cache.
	return ds.cache.add(kvObject, kvObject.Skip())
}
// GetObject gets data from the store and unmarshals to the specified object.
func (ds *Store) GetObject(o KVObject) error {
	ds.mu.Lock()
	defer ds.mu.Unlock()
	// Reads are served from the cache, which lazily populates itself from
	// the backing store (see cache.kmap).
	return ds.cache.get(o)
}
// ensureParent creates the parent key with an empty value if it does not
// exist yet, so that listing under it cannot fail on a missing key.
func (ds *Store) ensureParent(parent string) error {
	if exists, err := ds.store.Exists(parent); err != nil {
		return err
	} else if exists {
		return nil
	}
	return ds.store.Put(parent, []byte{})
}
// List returns of a list of KVObjects belonging to the parent key. The caller
// must pass a KVObject of the same type as the objects that need to be listed.
func (ds *Store) List(kvObject KVObject) ([]KVObject, error) {
	ds.mu.Lock()
	objs, err := ds.cache.list(kvObject)
	ds.mu.Unlock()
	return objs, err
}
// iterateKVPairsFromStore lists every pair stored under key directly from
// the backing store, unmarshals each non-empty value into a fresh object
// created from ctor, and invokes callback with the pair's key and object.
func (ds *Store) iterateKVPairsFromStore(key string, ctor KVObject, callback func(string, KVObject)) error {
	// Make sure the parent key exists
	if err := ds.ensureParent(key); err != nil {
		return err
	}

	pairs, err := ds.store.List(key)
	if err != nil {
		return err
	}

	for _, pair := range pairs {
		// Skip placeholder entries with empty values.
		if len(pair.Value) == 0 {
			continue
		}

		obj := ctor.New()
		if err := obj.SetValue(pair.Value); err != nil {
			return err
		}

		// Make sure the object has a correct view of the DB index in
		// case we need to modify it and update the DB.
		obj.SetIndex(pair.LastIndex)
		callback(pair.Key, obj)
	}

	return nil
}
// Map returns a Map of KVObjects stored under key, indexed by each object's
// store key with leading/trailing "/" removed.
func (ds *Store) Map(key string, kvObject KVObject) (map[string]KVObject, error) {
	ds.mu.Lock()
	defer ds.mu.Unlock()

	results := make(map[string]KVObject)
	err := ds.iterateKVPairsFromStore(key, kvObject, func(k string, v KVObject) {
		// Trim the leading & trailing "/" to make it consistent across all stores
		results[strings.Trim(k, "/")] = v
	})
	if err != nil {
		return nil, err
	}
	return results, nil
}
// DeleteObject deletes a kvObject from the on-disk DB and the in-memory cache.
// Unlike DeleteObjectAtomic, it doesn't check the optimistic lock of the
// passed kvObject.
func (ds *Store) DeleteObject(kvObject KVObject) error {
	ds.mu.Lock()
	defer ds.mu.Unlock()

	if kvObject == nil {
		return types.InvalidParameterErrorf("invalid KV Object: nil")
	}

	if !kvObject.Skip() {
		if err := ds.store.Delete(Key(kvObject.Key()...)); err != nil {
			return err
		}
	}

	// Only touch the cache once the on-disk delete succeeded. The cache-side
	// delete is non-atomic: no optimistic-lock check is performed there.
	return ds.cache.del(kvObject, false)
}
// DeleteObjectAtomic performs atomic delete on a record. ErrKeyModified is
// returned when the stored index no longer matches kvObject's view.
func (ds *Store) DeleteObjectAtomic(kvObject KVObject) error {
	ds.mu.Lock()
	defer ds.mu.Unlock()

	if kvObject == nil {
		return types.InvalidParameterErrorf("invalid KV Object: nil")
	}

	previous := &store.KVPair{Key: Key(kvObject.Key()...), LastIndex: kvObject.Index()}
	if !kvObject.Skip() {
		err := ds.store.AtomicDelete(Key(kvObject.Key()...), previous)
		switch {
		case errors.Is(err, store.ErrKeyExists):
			return ErrKeyModified
		case err != nil:
			return err
		}
	}

	// cleanup the cache only if AtomicDelete went through successfully
	// If persistent store is skipped, sequencing needs to
	// happen in cache.
	return ds.cache.del(kvObject, kvObject.Skip())
}
package libnetwork
import (
"context"
"fmt"
"strings"
cerrdefs "github.com/containerd/errdefs"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/types"
)
const (
	// gwEPlen is the number of container-ID characters used when deriving
	// the gateway endpoint name.
	gwEPlen = 12
)

// procGwNetwork is a one-slot semaphore serializing creation of the default
// gateway network.
var procGwNetwork = make(chan bool, 1)

/*
   libnetwork creates a bridge network "docker_gw_bridge" for providing
   default gateway for the containers if none of the container's endpoints
   have GW set by the driver. ICC is set to false for the GW_bridge network.

   If a driver can't provide external connectivity it can choose to not set
   the GW IP for the endpoint.

   endpoint on the GW_bridge network is managed dynamically by libnetwork.
   ie:
   - its created when an endpoint without GW joins the container
   - its deleted when an endpoint with GW joins the container
*/
// setupDefaultGW connects the sandbox to the default gateway network by
// creating and joining a "gateway_"-prefixed endpoint on it. It is a no-op
// if the sandbox already has an endpoint in the gateway network. The
// endpoint is best-effort deleted again if the join fails.
func (sb *Sandbox) setupDefaultGW() error {
	// check if the container already has a GW endpoint
	if ep := sb.getEndpointInGWNetwork(); ep != nil {
		return nil
	}

	c := sb.controller

	// Look for default gw network. In case of error (includes not found),
	// retry and create it if needed in a serialized execution.
	n, err := c.NetworkByName(libnGWNetwork)
	if err != nil {
		if n, err = c.defaultGwNetwork(); err != nil {
			return err
		}
	}

	createOptions := []EndpointOption{}

	// Derive the endpoint name from the container ID, truncated to gwEPlen
	// characters. Fix: the long-ID branch previously sliced sb.id, which is
	// not the container ID on platforms with random sandbox IDs, making the
	// two branches inconsistent.
	var gwName string
	if len(sb.containerID) <= gwEPlen {
		gwName = "gateway_" + sb.containerID
	} else {
		gwName = "gateway_" + sb.containerID[:gwEPlen]
	}

	// Propagate the sandbox's port-mapping and exposed-port labels onto the
	// gateway endpoint.
	sbLabels := sb.Labels()
	if sbLabels[netlabel.PortMap] != nil {
		createOptions = append(createOptions, CreateOptionPortMapping(sbLabels[netlabel.PortMap].([]types.PortBinding)))
	}
	if sbLabels[netlabel.ExposedPorts] != nil {
		createOptions = append(createOptions, CreateOptionExposedPorts(sbLabels[netlabel.ExposedPorts].([]types.TransportPort)))
	}

	if epOption := getPlatformOption(); epOption != nil {
		createOptions = append(createOptions, epOption)
	}

	newEp, err := n.CreateEndpoint(context.TODO(), gwName, createOptions...)
	if err != nil {
		return fmt.Errorf("container %s: endpoint create on GW Network failed: %v", sb.containerID, err)
	}

	// If the join below fails, clean up the endpoint we just created.
	defer func() {
		if err != nil {
			if err2 := newEp.Delete(context.WithoutCancel(context.TODO()), true); err2 != nil {
				log.G(context.TODO()).Warnf("Failed to remove gw endpoint for container %s after failing to join the gateway network: %v",
					sb.containerID, err2)
			}
		}
	}()

	if err = newEp.sbJoin(context.TODO(), sb); err != nil {
		return fmt.Errorf("container %s: endpoint join on GW Network failed: %v", sb.containerID, err)
	}

	return nil
}
// clearDefaultGW detaches and removes the endpoint connecting the sandbox to
// the default gw network, if such an endpoint is present.
func (sb *Sandbox) clearDefaultGW() error {
	ep := sb.getEndpointInGWNetwork()
	if ep == nil {
		return nil
	}

	if err := ep.sbLeave(context.TODO(), sb, false); err != nil {
		return fmt.Errorf("container %s: endpoint leaving GW Network failed: %v", sb.containerID, err)
	}
	if err := ep.Delete(context.TODO(), false); err != nil {
		return fmt.Errorf("container %s: deleting endpoint on GW Network failed: %v", sb.containerID, err)
	}
	return nil
}
// needDefaultGW evaluates whether the sandbox requires a default gateway
// based on the endpoints to which it is connected. It does not account for
// the default gateway network endpoint itself, and returns false as soon as
// any endpoint already provides a gateway or a default route.
func (sb *Sandbox) needDefaultGW() bool {
	required := false
	for _, ep := range sb.Endpoints() {
		nw := ep.getNetwork()
		switch {
		case ep.endpointInGWNetwork():
			continue
		case nw.Type() == "null" || nw.Type() == "host":
			continue
		case nw.Internal():
			continue
		// During stale sandbox cleanup, joinInfo may be nil
		case ep.joinInfo != nil && ep.joinInfo.disableGatewayService:
			continue
		}
		if len(ep.Gateway()) > 0 || len(ep.GatewayIPv6()) > 0 {
			return false
		}
		for _, r := range ep.StaticRoutes() {
			if r.Destination != nil && r.Destination.String() == "0.0.0.0/0" {
				return false
			}
		}
		required = true
	}
	return required
}
// getEndpointInGWNetwork returns the sandbox's endpoint on the default
// gateway network, or nil when the sandbox has none.
func (sb *Sandbox) getEndpointInGWNetwork() *Endpoint {
	for _, ep := range sb.Endpoints() {
		if ep.endpointInGWNetwork() {
			return ep
		}
	}
	return nil
}
// endpointInGWNetwork reports whether ep is the sandbox's endpoint on the
// default gateway network, identified by the network name and the
// "gateway_" endpoint-name prefix.
func (ep *Endpoint) endpointInGWNetwork() bool {
	// Return the condition directly instead of if/return true/return false.
	return ep.getNetwork().name == libnGWNetwork && strings.HasPrefix(ep.Name(), "gateway_")
}
// defaultGwNetwork looks for the default gw network and creates it if not
// there. Parallel executions are serialized via the procGwNetwork semaphore.
func (c *Controller) defaultGwNetwork() (*Network, error) {
	procGwNetwork <- true
	defer func() { <-procGwNetwork }()

	n, err := c.NetworkByName(libnGWNetwork)
	if !cerrdefs.IsNotFound(err) {
		return n, err
	}
	return c.createGWNetwork()
}
// getGatewayEndpoint returns the endpoints providing external connectivity to
// the sandbox. If the gateway is dual-stack, ep4 and ep6 will point at the same
// endpoint. If there is no IPv4/IPv6 connectivity, nil pointers will be returned.
func (sb *Sandbox) getGatewayEndpoint() (ep4, ep6 *Endpoint) {
	// Selection logic lives in selectGatewayEndpoint so it can be applied to
	// arbitrary endpoint lists as well.
	return selectGatewayEndpoint(sb.Endpoints())
}
// selectGatewayEndpoint is like getGatewayEndpoint, but selects only from
// endpoints. The first dual-stack gateway wins outright; otherwise the first
// IPv4-only and first IPv6-only gateways found are returned.
func selectGatewayEndpoint(endpoints []*Endpoint) (ep4, ep6 *Endpoint) {
	for _, candidate := range endpoints {
		netType := candidate.getNetwork().Type()
		if netType == "null" || netType == "host" {
			continue
		}
		gw4, gw6 := candidate.hasGatewayOrDefaultRoute()
		if gw4 && gw6 {
			// The first dual-stack endpoint is the gateway, no need to search further.
			//
			// FIXME(robmry) - this means a dual-stack gateway is preferred over single-stack
			// gateways with higher gateway-priorities. A dual-stack network should probably
			// be preferred over two single-stack networks, if they all have equal priorities.
			// It'd probably also be better to use a dual-stack endpoint as the gateway for
			// a single address family, if there's a higher-priority single-stack gateway for
			// the other address family. (But, priority is currently a Sandbox property, not
			// an Endpoint property. So, this function doesn't have access to priorities.)
			return candidate, candidate
		}
		// Remember the first single-stack gateway of each family while the
		// search for a dual-stack (or missing-family) gateway continues.
		if gw4 && ep4 == nil {
			ep4 = candidate
		}
		if gw6 && ep6 == nil {
			ep6 = candidate
		}
	}
	return ep4, ep6
}
package libnetwork
import (
"context"
"fmt"
"strconv"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge"
"github.com/docker/docker/internal/otelutil"
"go.opentelemetry.io/otel/baggage"
)
// libnGWNetwork is the name of the default gateway network; it is also used
// as the bridge device name (see createGWNetwork).
const libnGWNetwork = "docker_gwbridge"

// getPlatformOption returns platform-specific options for the gateway
// endpoint; there are none on this platform.
func getPlatformOption() EndpointOption {
	return nil
}
// createGWNetwork creates the default gateway network "docker_gwbridge": an
// IPv4-only bridge with inter-container communication disabled and IP
// masquerading enabled.
func (c *Controller) createGWNetwork() (*Network, error) {
	// Tag the context so the creation can be traced back to this trigger.
	ctx := baggage.ContextWithBaggage(context.TODO(), otelutil.MustNewBaggage(
		otelutil.MustNewMemberRaw(otelutil.TriggerKey, "libnetwork.Controller.createGWNetwork"),
	))
	n, err := c.NewNetwork(ctx, "bridge", libnGWNetwork, "",
		NetworkOptionDriverOpts(map[string]string{
			bridge.BridgeName:         libnGWNetwork,
			bridge.EnableICC:          strconv.FormatBool(false),
			bridge.EnableIPMasquerade: strconv.FormatBool(true),
		}),
		NetworkOptionEnableIPv4(true),
		NetworkOptionEnableIPv6(false),
	)
	if err != nil {
		return nil, fmt.Errorf("error creating external connectivity network: %v", err)
	}
	// err is nil here; return an explicit nil rather than the stale variable.
	return n, nil
}
package diagnostic
import (
"context"
"encoding/json"
"fmt"
"net"
"net/http"
"strconv"
"sync"
"time"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/internal/caller"
)
// Server is the network diagnostic HTTP server. When enabled it exposes the
// registered diagnostic handlers over a TCP socket.
type Server struct {
	mu       sync.Mutex
	enable   bool         // true while the server is running
	srv      *http.Server // nil when the server is not running
	port     int
	mux      *http.ServeMux
	handlers map[string]http.Handler // current handler for each registered pattern
}
// New creates a new diagnostic server with the default handlers ("/",
// "/help" and "/ready") pre-registered.
func New() *Server {
	srv := &Server{
		mux:      http.NewServeMux(),
		handlers: map[string]http.Handler{},
	}
	srv.HandleFunc("/", notImplemented)
	srv.HandleFunc("/help", srv.help)
	srv.HandleFunc("/ready", ready)
	return srv
}
// Handle registers the handler for the given pattern,
// replacing any existing handler.
func (s *Server) Handle(pattern string, handler http.Handler) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if _, ok := s.handlers[pattern]; !ok {
		// Register a handler on the mux which allows the underlying handler to
		// be dynamically switched out. The http.ServeMux will panic if one
		// attempts to register a handler for the same pattern twice.
		s.mux.HandleFunc(pattern, func(w http.ResponseWriter, r *http.Request) {
			// Look up the current handler on every request, so a handler
			// registered later for this pattern takes effect immediately.
			s.mu.Lock()
			h := s.handlers[pattern]
			s.mu.Unlock()
			h.ServeHTTP(w, r)
		})
	}
	s.handlers[pattern] = handler
}
// HandleFunc registers the handler function for the given pattern,
// replacing any existing handler. It is a convenience wrapper around Handle.
func (s *Server) HandleFunc(pattern string, handler func(http.ResponseWriter, *http.Request)) {
	s.Handle(pattern, http.HandlerFunc(handler))
}
// ServeHTTP is the method called by ListenAndServe; it is needed to allow us
// to use our custom mux.
func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	s.mux.ServeHTTP(w, r)
}
// Enable opens a TCP socket on ip:port serving the diagnostic endpoints.
// If the server is already running, the call only logs a message; the
// listener is not reconfigured.
func (s *Server) Enable(ip string, port int) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.port = port
	log.G(context.TODO()).WithFields(log.Fields{"port": s.port, "ip": ip}).Warn("Starting network diagnostic server")
	// FIXME(thaJeztah): this check won't allow re-configuring the port on reload.
	if s.enable {
		log.G(context.TODO()).WithFields(log.Fields{"port": s.port, "ip": ip}).Info("Network diagnostic server is already up and running")
		return
	}

	addr := net.JoinHostPort(ip, strconv.Itoa(s.port))
	log.G(context.TODO()).WithFields(log.Fields{"port": s.port, "ip": ip}).Infof("Starting network diagnostic server listening on %s for commands", addr)
	srv := &http.Server{
		Addr:              addr,
		Handler:           s,
		ReadHeaderTimeout: 5 * time.Minute, // "G112: Potential Slowloris Attack (gosec)"; not a real concern for our use, so setting a long timeout.
	}
	s.srv = srv
	s.enable = true
	// Serve in the background; the server stays up until Shutdown is called.
	go func(n *Server) {
		// Ignore ErrServerClosed that is returned on the Shutdown call
		if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
			log.G(context.TODO()).Errorf("ListenAndServe error: %s", err)
			// Mark the server as down again so a later Enable can retry.
			n.mu.Lock()
			defer n.mu.Unlock()
			n.enable = false
		}
	}(s)
}
// Shutdown stops the diagnostic server and closes its TCP socket. It is a
// no-op when the server is not running.
func (s *Server) Shutdown() {
	s.mu.Lock()
	defer s.mu.Unlock()
	if !s.enable {
		return
	}

	if srv := s.srv; srv != nil {
		if err := srv.Shutdown(context.Background()); err != nil {
			log.G(context.TODO()).WithError(err).Warn("Error during network diagnostic server shutdown")
		}
		s.srv = nil
	}
	s.enable = false
	log.G(context.TODO()).Info("Network diagnostic server shutdown complete")
}
// Enabled reports whether the diagnostic server is currently running.
func (s *Server) Enabled() bool {
	s.mu.Lock()
	enabled := s.enable
	s.mu.Unlock()
	return enabled
}
// notImplemented is the fallback handler: it logs the request and replies
// with a "not implemented" message pointing the caller at /help.
func notImplemented(w http.ResponseWriter, r *http.Request) {
	_ = r.ParseForm()
	_, jsonOutput := ParseHTTPFormOptions(r)

	// audit logs
	log.G(context.TODO()).WithFields(log.Fields{
		"component": "diagnostic",
		"remoteIP":  r.RemoteAddr,
		"method":    caller.Name(0),
		"url":       r.URL.String(),
	}).Info("command not implemented done")

	reply := WrongCommand("not implemented", fmt.Sprintf("URL path: %s no method implemented check /help\n", r.URL.Path))
	_, _ = HTTPReply(w, reply, jsonOutput)
}
// help lists every registered endpoint pattern, one per line.
func (s *Server) help(w http.ResponseWriter, r *http.Request) {
	_ = r.ParseForm()
	_, jsonOutput := ParseHTTPFormOptions(r)

	// audit logs
	log.G(context.TODO()).WithFields(log.Fields{
		"component": "diagnostic",
		"remoteIP":  r.RemoteAddr,
		"method":    caller.Name(0),
		"url":       r.URL.String(),
	}).Info("help done")

	s.mu.Lock()
	var result string
	for pattern := range s.handlers {
		result += pattern + "\n"
	}
	s.mu.Unlock()
	_, _ = HTTPReply(w, CommandSucceed(&StringCmd{Info: result}), jsonOutput)
}
// ready is a liveness probe handler; it always replies "OK".
func ready(w http.ResponseWriter, r *http.Request) {
	_ = r.ParseForm()
	_, jsonOut := ParseHTTPFormOptions(r)

	// audit logs
	log.G(context.TODO()).WithFields(log.Fields{
		"component": "diagnostic",
		"remoteIP":  r.RemoteAddr,
		"method":    caller.Name(0),
		"url":       r.URL.String(),
	}).Info("ready done")
	_, _ = HTTPReply(w, CommandSucceed(&StringCmd{Info: "OK"}), jsonOut)
}
// DebugHTTPForm logs every parsed form parameter of the request at debug level.
func DebugHTTPForm(r *http.Request) {
	for key, values := range r.Form {
		log.G(context.TODO()).Debugf("Form[%q] = %q\n", key, values)
	}
}
// JSONOutput contains details on JSON output printing
type JSONOutput struct {
enable bool
prettyPrint bool
}
// ParseHTTPFormOptions easily parse the JSON printing options. It returns
// whether the "unsafe" key is present, plus the JSON options derived from
// the "json" key ("json=pretty" enables indentation).
func ParseHTTPFormOptions(r *http.Request) (bool, *JSONOutput) {
	_, unsafe := r.Form["unsafe"]
	jsonValues, jsonRequested := r.Form["json"]
	pretty := len(jsonValues) > 0 && jsonValues[0] == "pretty"
	return unsafe, &JSONOutput{enable: jsonRequested, prettyPrint: pretty}
}
// HTTPReply helper function that takes care of sending the message out.
// The result r is serialized either as JSON (compact or indented, per j)
// or as plain text via r.String(). It returns the byte count and error
// from writing the response body.
func HTTPReply(w http.ResponseWriter, r *HTTPResult, j *JSONOutput) (int, error) {
	var response []byte
	if j.enable {
		w.Header().Set("Content-Type", "application/json")
		var err error
		if j.prettyPrint {
			response, err = json.MarshalIndent(r, "", " ")
			if err != nil {
				// Best-effort fallback: marshal the marshaling error itself.
				response, _ = json.MarshalIndent(FailCommand(err), "", " ") //nolint:errchkjson // ignore "Error return value of `encoding/json.MarshalIndent` is not checked: unsafe type `StringInterface`"
			}
		} else {
			response, err = json.Marshal(r)
			if err != nil {
				// Best-effort fallback: marshal the marshaling error itself.
				response, _ = json.Marshal(FailCommand(err)) //nolint:errchkjson // ignore "Error return value of `encoding/json.MarshalIndent` is not checked: unsafe type `StringInterface`"
			}
		}
	} else {
		response = []byte(r.String())
	}
	return fmt.Fprint(w, string(response))
}
package diagnostic
import (
	"fmt"
	"strings"
)
// StringInterface interface that has to be implemented by messages so they
// can be rendered in the plain-text (non-JSON) reply format.
type StringInterface interface {
	String() string
}
// CommandSucceed creates a success message wrapping the given result payload.
func CommandSucceed(result StringInterface) *HTTPResult {
	rsp := &HTTPResult{Message: "OK"}
	rsp.Details = result
	return rsp
}
// FailCommand creates a failure message carrying the error text.
func FailCommand(err error) *HTTPResult {
	details := &ErrorCmd{Error: err.Error()}
	return &HTTPResult{Message: "FAIL", Details: details}
}
// WrongCommand creates a wrong command response carrying a usage hint.
func WrongCommand(message, usage string) *HTTPResult {
	return &HTTPResult{
		Message: message,
		Details: &UsageCmd{
			Usage: usage,
		},
	}
}
// HTTPResult Diagnostic Server HTTP result operation.
type HTTPResult struct {
	Message string `json:"message"` // short outcome, e.g. "OK" or "FAIL"
	Details StringInterface `json:"details"` // typed payload; may be nil
}
// String renders the result as plain text: the message, followed by the
// details on a new line when present.
func (h *HTTPResult) String() string {
	if h.Details == nil {
		return h.Message
	}
	return h.Message + "\n" + h.Details.String()
}
// UsageCmd command with usage field.
type UsageCmd struct {
	Usage string `json:"usage"`
}

// String renders the usage text prefixed with "Usage: ".
func (u *UsageCmd) String() string {
	return fmt.Sprintf("Usage: %s", u.Usage)
}
// StringCmd command with info string.
type StringCmd struct {
	Info string `json:"info"`
}

// String returns the raw info string, with no decoration.
func (s *StringCmd) String() string {
	return s.Info
}
// ErrorCmd command with error.
type ErrorCmd struct {
	Error string `json:"error"`
}

// String renders the error text prefixed with "Error: ".
func (e *ErrorCmd) String() string {
	return fmt.Sprintf("Error: %s", e.Error)
}
// TableObj network db table object.
type TableObj struct {
	Length   int               `json:"size"`
	Elements []StringInterface `json:"entries"`
}

// String renders the table as a "total entries" header followed by the
// rendering of each element. A strings.Builder replaces the original
// string "+=" loop, avoiding quadratic copying on large tables.
func (t *TableObj) String() string {
	var b strings.Builder
	fmt.Fprintf(&b, "total entries: %d\n", t.Length)
	for _, e := range t.Elements {
		b.WriteString(e.String())
	}
	return b.String()
}
// PeerEntryObj entry in the networkdb peer table
type PeerEntryObj struct {
Index int `json:"-"`
Name string `json:"-=name"`
IP string `json:"ip"`
}
func (p *PeerEntryObj) String() string {
return fmt.Sprintf("%d) %s -> %s\n", p.Index, p.Name, p.IP)
}
// TableEntryObj network db table entry object.
type TableEntryObj struct {
	Index int    `json:"-"`
	Key   string `json:"key"`
	Value string `json:"value"`
	Owner string `json:"owner"`
}

// String renders the entry as "<index>) k:`<key>` -> v:`<value>` owner:`<owner>`\n".
func (t *TableEntryObj) String() string {
	line := fmt.Sprintf("%d) k:`%s` -> v:`%s` owner:`%s`\n", t.Index, t.Key, t.Value, t.Owner)
	return line
}
// TableEndpointsResult fully typed message for proper unmarshaling on the client side.
type TableEndpointsResult struct {
	TableObj
	// Elements shadows TableObj.Elements with a concrete element type so
	// clients can unmarshal the "entries" key without a custom decoder.
	Elements []TableEntryObj `json:"entries"`
}

// TablePeersResult fully typed message for proper unmarshaling on the client side.
type TablePeersResult struct {
	TableObj
	// Elements shadows TableObj.Elements with a concrete element type so
	// clients can unmarshal the "entries" key without a custom decoder.
	Elements []PeerEntryObj `json:"entries"`
}
// NetworkStatsResult network db stats related to entries and queue len for a network
type NetworkStatsResult struct {
Entries int `json:"entries"`
QueueLen int `jsoin:"qlen"`
}
func (n *NetworkStatsResult) String() string {
return fmt.Sprintf("entries: %d, qlen: %d\n", n.Entries, n.QueueLen)
}
package driverapi
import (
"context"
"net"
"github.com/docker/docker/daemon/libnetwork/options"
)
// NetworkPluginEndpointType represents the Endpoint Type used by the Plugin
// system to register/discover remote network drivers.
const NetworkPluginEndpointType = "NetworkDriver"
// Driver is an interface that every plugin driver needs to implement.
type Driver interface {
	// NetworkAllocate invokes the driver method to allocate network
	// specific resources passing network id and network specific config.
	// It returns a key,value pair of network specific driver allocations
	// to the caller.
	NetworkAllocate(nid string, options map[string]string, ipV4Data, ipV6Data []IPAMData) (map[string]string, error)

	// NetworkFree invokes the driver method to free network specific resources
	// associated with a given network id.
	NetworkFree(nid string) error

	// CreateNetwork invokes the driver method to create a network
	// passing the network id and network specific config. The
	// config mechanism will eventually be replaced with labels
	// which are yet to be introduced. The driver can return a
	// list of table names for which it is interested in receiving
	// notification when a CRUD operation is performed on any
	// entry in that table. This will be ignored for local scope
	// drivers.
	CreateNetwork(ctx context.Context, nid string, options map[string]interface{}, nInfo NetworkInfo, ipV4Data, ipV6Data []IPAMData) error

	// DeleteNetwork invokes the driver method to delete network passing
	// the network id.
	DeleteNetwork(nid string) error

	// CreateEndpoint invokes the driver method to create an endpoint
	// passing the network id, endpoint id, endpoint information and driver
	// specific config. The endpoint information can be either consumed by
	// the driver or populated by the driver. The config mechanism will
	// eventually be replaced with labels which are yet to be introduced.
	CreateEndpoint(ctx context.Context, nid, eid string, ifInfo InterfaceInfo, options map[string]interface{}) error

	// DeleteEndpoint invokes the driver method to delete an endpoint
	// passing the network id and endpoint id.
	DeleteEndpoint(nid, eid string) error

	// EndpointOperInfo retrieves from the driver the operational data related to the specified endpoint.
	EndpointOperInfo(nid, eid string) (map[string]interface{}, error)

	// Join method is invoked when a Sandbox is attached to an endpoint.
	Join(ctx context.Context, nid, eid string, sboxKey string, jinfo JoinInfo, epOpts, sbOpts map[string]interface{}) error

	// Leave method is invoked when a Sandbox detaches from an endpoint.
	Leave(nid, eid string) error

	// EventNotify notifies the driver when a CRUD operation has
	// happened on a table of its interest as soon as this node
	// receives such an event in the gossip layer. This method is
	// only invoked for the global scope driver.
	EventNotify(event EventType, nid string, tableName string, key string, value []byte)

	// DecodeTableEntry passes the driver a key, value pair from table it registered
	// with libnetwork. Driver should return {object ID, map[string]string} tuple.
	// If DecodeTableEntry is called for a table associated with NetworkObject or
	// EndpointObject the return object ID should be the network id or endpoint id
	// associated with that entry. map should have information about the object that
	// can be presented to the user.
	// For example: overlay driver returns the VTEP IP of the host that has the endpoint
	// which is shown in 'network inspect --verbose'
	DecodeTableEntry(tablename string, key string, value []byte) (string, map[string]string)

	// Type returns the type of this driver, the network type this driver manages.
	Type() string

	// IsBuiltIn returns true if it is a built-in driver.
	IsBuiltIn() bool
}
// ExtConner is an optional interface for a network driver. Drivers that
// implement it are told which endpoints currently provide the container's
// default gateways.
type ExtConner interface {
	// ProgramExternalConnectivity tells the driver the ids of the endpoints
	// currently acting as the container's default gateway for IPv4 and IPv6,
	// passed as gw4Id/gw6Id. (Those endpoints may be managed by different network
	// drivers. If there is no gateway, the id will be the empty string.)
	//
	// This method is called after Driver.Join, before Driver.Leave, and when eid
	// is or was equal to gw4Id or gw6Id, and there's a change. It may also be
	// called when the gateways have not changed.
	//
	// When an endpoint acting as a gateway is deleted, this function is called
	// with that endpoint's id in eid, and empty gateway ids (even if another
	// is present and will shortly be selected as the gateway).
	ProgramExternalConnectivity(ctx context.Context, nid, eid string, gw4Id, gw6Id string) error
}
// GwAllocChecker is an optional interface for a network driver. It lets a
// driver opt out of gateway address allocation before the network is created.
type GwAllocChecker interface {
	// GetSkipGwAlloc returns true if the opts describe a network
	// that does not need a gateway IPv4/IPv6 address, else false.
	GetSkipGwAlloc(opts options.Generic) (skipIPv4, skipIPv6 bool, err error)
}

// NetworkInfo provides a go interface for drivers to provide network
// specific information to libnetwork.
type NetworkInfo interface {
	// TableEventRegister registers driver interest in a given
	// table name.
	TableEventRegister(tableName string, objType ObjectType) error

	// UpdateIpamConfig updates the networks IPAM configuration
	// based on information from the driver. In windows, the OS (HNS) chooses
	// the IP address space if the user does not specify an address space.
	UpdateIpamConfig(ipV4Data []IPAMData)
}
// InterfaceInfo provides a go interface for drivers to retrieve
// network information to interface resources.
type InterfaceInfo interface {
	// SetMacAddress allows the driver to set the mac address to the endpoint interface
	// during the call to CreateEndpoint, if the mac address is not already set.
	SetMacAddress(mac net.HardwareAddr) error

	// SetIPAddress allows the driver to set the ip address to the endpoint interface
	// during the call to CreateEndpoint, if the address is not already set.
	// The API is to be used to assign both the IPv4 and IPv6 address types.
	SetIPAddress(ip *net.IPNet) error

	// MacAddress returns the MAC address.
	MacAddress() net.HardwareAddr

	// Address returns the IPv4 address.
	Address() *net.IPNet

	// AddressIPv6 returns the IPv6 address.
	AddressIPv6() *net.IPNet

	// NetnsPath returns the path of the network namespace, if there is one. Else "".
	NetnsPath() string

	// SetCreatedInContainer can be called by the driver to indicate that it's
	// created the network interface in the container's network namespace (so,
	// it doesn't need to be moved there).
	SetCreatedInContainer(bool)
}

// InterfaceNameInfo provides a go interface for the drivers to assign names
// to interfaces.
type InterfaceNameInfo interface {
	// SetNames method assigns the srcName, dstPrefix, and dstName for the
	// interface. If both dstName and dstPrefix are set, dstName takes
	// precedence.
	SetNames(srcName, dstPrefix, dstName string) error
}
// JoinInfo represents a set of resources that the driver has the ability to provide during
// join time.
type JoinInfo interface {
	// InterfaceName returns an InterfaceNameInfo go interface to facilitate
	// setting the names for the interface.
	InterfaceName() InterfaceNameInfo

	// SetGateway sets the default IPv4 gateway when a container joins the endpoint.
	SetGateway(net.IP) error

	// SetGatewayIPv6 sets the default IPv6 gateway when a container joins the endpoint.
	SetGatewayIPv6(net.IP) error

	// AddStaticRoute adds a route to the sandbox.
	// It may be used in addition to or instead of a default gateway (as above).
	AddStaticRoute(destination *net.IPNet, routeType int, nextHop net.IP) error

	// DisableGatewayService tells libnetwork not to provide Default GW for the container.
	DisableGatewayService()

	// AddTableEntry adds a table entry to the gossip layer
	// passing the table name, key and an opaque value.
	AddTableEntry(tableName string, key string, value []byte) error
}

// Registerer provides a way for network drivers to be dynamically registered.
type Registerer interface {
	RegisterDriver(name string, driver Driver, capability Capability) error
}

// Capability represents the high level capabilities of the drivers which libnetwork can make use of.
type Capability struct {
	DataScope string
	ConnectivityScope string
}
// IPAMData represents the per-network ip related
// operational information libnetwork will send
// to the network driver during CreateNetwork().
type IPAMData struct {
	AddressSpace string
	Pool *net.IPNet
	Gateway *net.IPNet
	AuxAddresses map[string]*net.IPNet
}

// EventType defines a type for the CRUD event.
type EventType uint8

const (
	// Create event is generated when a table entry is created.
	Create EventType = 1 + iota
	// Update event is generated when a table entry is updated.
	Update
	// Delete event is generated when a table entry is deleted.
	Delete
)

// ObjectType represents the type of object driver wants to store in libnetwork's networkDB.
type ObjectType int

const (
	// EndpointObject should be set for libnetwork endpoint object related data.
	EndpointObject ObjectType = 1 + iota
	// NetworkObject should be set for libnetwork network object related data.
	NetworkObject
	// OpaqueObject is for driver specific data with no corresponding libnetwork object.
	OpaqueObject
)
// IsValidType validates the passed in type against the valid object types.
// It returns true only for EndpointObject, NetworkObject and OpaqueObject.
func IsValidType(objType ObjectType) bool {
	// A single case list replaces the original fallthrough chain; the set of
	// accepted values is unchanged.
	switch objType {
	case EndpointObject, NetworkObject, OpaqueObject:
		return true
	}
	return false
}
package driverapi
import (
"fmt"
)
// ErrNoNetwork is returned if no network with the specified id exists.
type ErrNoNetwork string

// Error formats the missing-network message with the network id.
func (enn ErrNoNetwork) Error() string {
	id := string(enn)
	return fmt.Sprintf("No network (%s) exists", id)
}

// NotFound denotes the type of this error.
func (enn ErrNoNetwork) NotFound() {}
// ErrEndpointExists is returned if more than one endpoint is added to the network.
type ErrEndpointExists string

// Error formats the duplicate-endpoint message with the endpoint id.
func (ee ErrEndpointExists) Error() string {
	id := string(ee)
	return fmt.Sprintf("Endpoint (%s) already exists (Only one endpoint allowed)", id)
}

// Forbidden denotes the type of this error.
func (ee ErrEndpointExists) Forbidden() {}
// ErrNotImplemented is returned when a Driver has not implemented an API yet.
type ErrNotImplemented struct{}

// Error returns the fixed not-implemented message.
func (eni *ErrNotImplemented) Error() string {
	const msg = "The API is not implemented yet"
	return msg
}

// NotImplemented denotes the type of this error.
func (eni *ErrNotImplemented) NotImplemented() {}
// ErrNoEndpoint is returned if no endpoint with the specified id exists.
type ErrNoEndpoint string

// Error formats the missing-endpoint message with the endpoint id.
func (ene ErrNoEndpoint) Error() string {
	id := string(ene)
	return fmt.Sprintf("No endpoint (%s) exists", id)
}

// NotFound denotes the type of this error.
func (ene ErrNoEndpoint) NotFound() {}
// ErrActiveRegistration represents an error when a driver is registered to a
// networkType that is previously registered.
type ErrActiveRegistration string

// Error formats the already-registered message with the network type.
func (ar ErrActiveRegistration) Error() string {
	networkType := string(ar)
	return fmt.Sprintf("Driver already registered for type %q", networkType)
}

// Forbidden denotes the type of this error.
func (ar ErrActiveRegistration) Forbidden() {}
package driverapi
import (
"encoding/json"
"fmt"
"net"
"github.com/docker/docker/daemon/libnetwork/types"
)
// MarshalJSON encodes IPAMData into a json message. Nil Pool/Gateway and a
// nil AuxAddresses map are omitted from the output; CIDR values are encoded
// as their string form.
func (i *IPAMData) MarshalJSON() ([]byte, error) {
	m := map[string]interface{}{
		"AddressSpace": i.AddressSpace,
	}
	if i.Pool != nil {
		m["Pool"] = i.Pool.String()
	}
	if i.Gateway != nil {
		m["Gateway"] = i.Gateway.String()
	}
	if i.AuxAddresses != nil {
		aux := make(map[string]string, len(i.AuxAddresses))
		for name, addr := range i.AuxAddresses {
			aux[name] = addr.String()
		}
		m["AuxAddresses"] = aux
	}
	return json.Marshal(m)
}
// UnmarshalJSON decodes a json message into IPAMData
func (i *IPAMData) UnmarshalJSON(data []byte) error {
	var (
		m map[string]interface{}
		err error
	)
	// Note: the ":=" here declares a new err scoped to this if; the outer err
	// is used by the assignments further down.
	if err := json.Unmarshal(data, &m); err != nil {
		return err
	}
	// AddressSpace is treated as required: this type assertion panics if the
	// key is missing or not a string. NOTE(review): presumably inputs always
	// come from MarshalJSON above, which writes the key unconditionally —
	// confirm for any other producers.
	i.AddressSpace = m["AddressSpace"].(string)
	if v, ok := m["Pool"]; ok {
		if i.Pool, err = types.ParseCIDR(v.(string)); err != nil {
			return err
		}
	}
	if v, ok := m["Gateway"]; ok {
		if i.Gateway, err = types.ParseCIDR(v.(string)); err != nil {
			return err
		}
	}
	if v, ok := m["AuxAddresses"]; ok {
		// Round-trip through JSON to convert the generic value into a
		// map[string]string before parsing each CIDR.
		b, _ := json.Marshal(v) //nolint:errchkjson // FIXME: Error return value of unsafe type `interface{}` is unchecked (errchkjson)
		var am map[string]string
		if err = json.Unmarshal(b, &am); err != nil {
			return err
		}
		i.AuxAddresses = make(map[string]*net.IPNet, len(am))
		for k, v := range am {
			if i.AuxAddresses[k], err = types.ParseCIDR(v); err != nil {
				return err
			}
		}
	}
	return nil
}
// Validate checks whether the IPAMData structure contains congruent data:
// Pool and Gateway must be set, the gateway and all secondary addresses must
// match the pool's IP version, and the gateway and all secondary addresses
// must fall inside the pool.
func (i *IPAMData) Validate() error {
	var isV6 bool
	if i.Pool == nil {
		return types.InvalidParameterErrorf("invalid pool")
	}
	if i.Gateway == nil {
		return types.InvalidParameterErrorf("invalid gateway address")
	}
	isV6 = i.IsV6()
	if isV6 && i.Gateway.IP.To4() != nil || !isV6 && i.Gateway.IP.To4() == nil {
		return types.InvalidParameterErrorf("incongruent ip versions for pool and gateway")
	}
	for k, sip := range i.AuxAddresses {
		if isV6 && sip.IP.To4() != nil || !isV6 && sip.IP.To4() == nil {
			return types.InvalidParameterErrorf("incongruent ip versions for pool and secondary ip address %s", k)
		}
	}
	if !i.Pool.Contains(i.Gateway.IP) {
		return types.InvalidParameterErrorf("invalid gateway address (%s) does not belong to the pool (%s)", i.Gateway, i.Pool)
	}
	for k, sip := range i.AuxAddresses {
		if !i.Pool.Contains(sip.IP) {
			// Bug fix: report the offending secondary address itself; this
			// previously printed i.Gateway, making the message misleading.
			return types.InvalidParameterErrorf("invalid secondary address %s (%s) does not belong to the pool (%s)", k, sip, i.Pool)
		}
	}
	return nil
}
// IsV6 returns whether this is an IPv6 IPAMData structure.
// Callers must ensure i.Pool is non-nil (see Validate).
func (i *IPAMData) IsV6() bool {
	return i.Pool.IP.To4() == nil
}

// String renders the IPAMData fields in a multi-line, human-readable form
// for logging and debugging.
func (i *IPAMData) String() string {
	return fmt.Sprintf("AddressSpace: %s\nPool: %v\nGateway: %v\nAddresses: %v", i.AddressSpace, i.Pool, i.Gateway, i.AuxAddresses)
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package bridge
import (
"context"
"fmt"
"net"
"net/netip"
"os"
"slices"
"strconv"
"strings"
"sync"
"syscall"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/datastore"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge/internal/firewaller"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge/internal/iptabler"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge/internal/nftabler"
"github.com/docker/docker/daemon/libnetwork/drvregistry"
"github.com/docker/docker/daemon/libnetwork/internal/netiputil"
"github.com/docker/docker/daemon/libnetwork/internal/nftables"
"github.com/docker/docker/daemon/libnetwork/iptables"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/netutils"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/docker/docker/daemon/libnetwork/options"
"github.com/docker/docker/daemon/libnetwork/portmapperapi"
"github.com/docker/docker/daemon/libnetwork/scope"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/internal/nlwrap"
"github.com/docker/docker/internal/otelutil"
"github.com/docker/docker/internal/sliceutil"
"github.com/docker/docker/pkg/stringid"
"github.com/pkg/errors"
"github.com/vishvananda/netlink"
"github.com/vishvananda/netns"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
)
const (
	// NetworkType is the driver name this package registers with libnetwork.
	NetworkType = "bridge"
	// vethPrefix is the name prefix for generated veth interfaces.
	vethPrefix = "veth"
	// vethLen is the full generated veth name length: the prefix plus 7 characters.
	vethLen = len(vethPrefix) + 7
	// defaultContainerVethPrefix is used for the container-side interface name
	// when no prefix is configured (see netlabel.ContainerIfacePrefix).
	defaultContainerVethPrefix = "eth"
)

const (
	// DefaultGatewayV4AuxKey represents the default-gateway configured by the user.
	DefaultGatewayV4AuxKey = "DefaultGatewayIPv4"
	// DefaultGatewayV6AuxKey represents the ipv6 default-gateway configured by the user.
	DefaultGatewayV6AuxKey = "DefaultGatewayIPv6"
)

// spanPrefix is the prefix used for this driver's OpenTelemetry span names.
const spanPrefix = "libnetwork.drivers.bridge"

// DockerForwardChain is where libnetwork.programIngress puts Swarm's jump to DOCKER-INGRESS.
//
// FIXME(robmry) - it doesn't belong here.
const DockerForwardChain = iptabler.DockerForwardChain
// configuration info for the "bridge" driver. It is populated from the
// daemon's GenericData options in driver.configure.
type configuration struct {
	EnableIPForwarding bool
	DisableFilterForwardDrop bool
	EnableIPTables bool
	EnableIP6Tables bool
	// Hairpin indicates whether packets sent from a container to a host port
	// published by another container on the same bridge network should be
	// hairpinned.
	Hairpin bool
	AllowDirectRouting bool
}
// networkConfiguration for network specific configuration. The user-facing
// fields are populated from network-create labels (see fromLabels); the
// fields below the "Internal fields" marker are derived afterwards.
type networkConfiguration struct {
	ID string
	BridgeName string
	EnableIPv4 bool
	EnableIPv6 bool
	EnableIPMasquerade bool
	GwModeIPv4 gwMode
	GwModeIPv6 gwMode
	EnableICC bool
	TrustedHostInterfaces []string // Interface names must not contain ':' characters
	InhibitIPv4 bool
	Mtu int // 0 means unset; negative values are rejected by Validate
	DefaultBindingIP net.IP
	DefaultBridge bool
	HostIPv4 net.IP
	HostIPv6 net.IP
	ContainerIfacePrefix string
	// Internal fields set after ipam data parsing
	AddressIPv4 *net.IPNet
	AddressIPv6 *net.IPNet
	DefaultGatewayIPv4 net.IP
	DefaultGatewayIPv6 net.IP
	dbIndex uint64 // datastore bookkeeping — TODO confirm against the store code
	dbExists bool // datastore bookkeeping — TODO confirm against the store code
	Internal bool
	BridgeIfaceCreator ifaceCreator
}

// ifaceCreator represents how the bridge interface was created.
type ifaceCreator int8

const (
	ifaceCreatorUnknown ifaceCreator = iota
	ifaceCreatedByLibnetwork
	ifaceCreatedByUser
)
// containerConfiguration represents the user specified configuration for a container.
type containerConfiguration struct {
	ParentEndpoints []string
	ChildEndpoints []string
}

// connectivityConfiguration represents the user specified configuration regarding the external connectivity.
type connectivityConfiguration struct {
	PortBindings []portmapperapi.PortBindingReq
	ExposedPorts []types.TransportPort
	NoProxy6To4 bool
}

// bridgeEndpoint is the driver's per-endpoint state.
type bridgeEndpoint struct {
	id string
	nid string // id of the owning network
	srcName string // host-side veth interface name
	addr *net.IPNet
	addrv6 *net.IPNet
	macAddress net.HardwareAddr
	containerConfig *containerConfiguration
	extConnConfig *connectivityConfiguration
	portMapping []portmapperapi.PortBinding // Operational port bindings
	portBindingState portBindingMode // Not persisted, even on live-restore port mappings are re-created.
	dbIndex uint64 // datastore bookkeeping — TODO confirm against the store code
	dbExists bool // datastore bookkeeping — TODO confirm against the store code
}

// bridgeNetwork is the driver's per-network state. The embedded Mutex guards
// the fields (see the accessor methods below).
type bridgeNetwork struct {
	id string
	bridge *bridgeInterface // The bridge's L3 interface
	config *networkConfiguration
	endpoints map[string]*bridgeEndpoint // key: endpoint id
	driver *driver // The network's driver
	firewallerNetwork firewaller.Network
	sync.Mutex
}

// driver is the bridge driver instance. The embedded Mutex guards config and
// networks; configNetwork serializes configuration/store/firewalld-reload work.
type driver struct {
	config configuration
	networks map[string]*bridgeNetwork
	store *datastore.Store
	nlh nlwrap.Handle
	configNetwork sync.Mutex
	firewaller firewaller.Firewaller
	portmappers *drvregistry.PortMappers
	sync.Mutex
}

// gwMode is a per-address-family gateway mode for a bridge network.
type gwMode string

const (
	gwModeDefault gwMode = ""
	gwModeNAT gwMode = "nat"
	gwModeNATUnprot gwMode = "nat-unprotected"
	gwModeRouted gwMode = "routed"
	gwModeIsolated gwMode = "isolated"
)
// newDriver constructs a new, unconfigured bridge driver instance.
// (External registration goes through Register, which also configures it.)
func newDriver(store *datastore.Store, pms *drvregistry.PortMappers) *driver {
	return &driver{
		store: store,
		nlh: ns.NlHandle(),
		networks: map[string]*bridgeNetwork{},
		portmappers: pms,
	}
}
// Register registers a new instance of bridge driver with r, after applying
// the daemon-level driver configuration (see driver.configure).
func Register(r driverapi.Registerer, store *datastore.Store, pms *drvregistry.PortMappers, config map[string]interface{}) error {
	d := newDriver(store, pms)
	if err := d.configure(config); err != nil {
		return err
	}
	// The bridge driver is local in both data and connectivity scope.
	return r.RegisterDriver(NetworkType, d, driverapi.Capability{
		DataScope: scope.Local,
		ConnectivityScope: scope.Local,
	})
}
// The behaviour of previous implementations of bridge subnet prefix assignment
// is preserved here...
//
// The LL prefix, 'fe80::/64' can be used as an IPAM pool. Linux always assigns
// link-local addresses with this prefix. But, pool-assigned addresses are very
// unlikely to conflict.
//
// Don't allow a nonstandard LL subnet to overlap with 'fe80::/64'. For example,
// if the config asked for subnet prefix 'fe80::/80', the bridge and its
// containers would each end up with two LL addresses, Linux's '/64' and one from
// the IPAM pool claiming '/80'. Although the specified prefix length must not
// affect the host's determination of whether the address is on-link and to be
// added to the interface's Prefix List (RFC-5942), differing prefix lengths
// would be confusing and have been disallowed by earlier implementations of
// bridge address assignment.
func validateIPv6Subnet(addr netip.Prefix) error {
	// Must be a plain IPv6 prefix (an IPv4-mapped address does not qualify).
	if !addr.Addr().Is6() || addr.Addr().Is4In6() {
		return fmt.Errorf("'%s' is not a valid IPv6 subnet", addr)
	}
	if addr.Addr().IsMulticast() {
		return fmt.Errorf("multicast subnet '%s' is not allowed", addr)
	}
	// 'fe80::/64' itself is allowed; any other prefix overlapping it is not
	// (see the function comment above).
	if addr.Masked() != linkLocalPrefix && linkLocalPrefix.Overlaps(addr) {
		return fmt.Errorf("'%s' clashes with the Link-Local prefix 'fe80::/64'", addr)
	}
	return nil
}
// ValidateFixedCIDRV6 checks that val is an IPv6 address and prefix length that
// does not overlap with the link local subnet prefix 'fe80::/64'.
// An empty val is valid (IPv6 simply not configured).
func ValidateFixedCIDRV6(val string) error {
	if val == "" {
		return nil
	}
	prefix, err := netip.ParsePrefix(val)
	if err == nil {
		err = validateIPv6Subnet(prefix)
	}
	// On success err is nil here: errors.Wrap(nil, ...) returns nil, and
	// errdefs.InvalidParameter presumably passes nil through unchanged —
	// NOTE(review): confirm against the errdefs package.
	return errdefs.InvalidParameter(errors.Wrap(err, "invalid fixed-cidr-v6"))
}
// Validate performs a static validation on the network configuration parameters.
// Whatever can be assessed a priori before attempting any programming.
// It checks the MTU, that an address was allocated for each enabled family,
// that the IPv6 address is a legal subnet, and that any user-specified
// default gateway falls inside the corresponding bridge subnet.
func (ncfg *networkConfiguration) Validate() error {
	if ncfg.Mtu < 0 {
		return errdefs.InvalidParameter(fmt.Errorf("invalid MTU number: %d", ncfg.Mtu))
	}
	if ncfg.EnableIPv4 {
		// If IPv4 is enabled, AddressIPv4 must have been configured.
		if ncfg.AddressIPv4 == nil {
			return errdefs.System(errors.New("no IPv4 address was allocated for the bridge"))
		}
		// If default gw is specified, it must be part of bridge subnet
		if ncfg.DefaultGatewayIPv4 != nil {
			if !ncfg.AddressIPv4.Contains(ncfg.DefaultGatewayIPv4) {
				return errInvalidGateway
			}
		}
	}
	if ncfg.EnableIPv6 {
		// If IPv6 is enabled, AddressIPv6 must have been configured.
		if ncfg.AddressIPv6 == nil {
			return errdefs.System(errors.New("no IPv6 address was allocated for the bridge"))
		}
		// AddressIPv6 must be IPv6, and not overlap with the LL subnet prefix.
		addr, ok := netiputil.ToPrefix(ncfg.AddressIPv6)
		if !ok {
			return errdefs.InvalidParameter(fmt.Errorf("invalid IPv6 address '%s'", ncfg.AddressIPv6))
		}
		if err := validateIPv6Subnet(addr); err != nil {
			return errdefs.InvalidParameter(err)
		}
		// If a default gw is specified, it must belong to AddressIPv6's subnet
		if ncfg.DefaultGatewayIPv6 != nil && !ncfg.AddressIPv6.Contains(ncfg.DefaultGatewayIPv6) {
			return errInvalidGateway
		}
	}
	return nil
}
// Conflicts check if two NetworkConfiguration objects overlap: same bridge
// name, or overlapping IPv4/IPv6 subnets. A nil argument is rejected.
func (ncfg *networkConfiguration) Conflicts(o *networkConfiguration) error {
	if o == nil {
		return errors.New("same configuration")
	}
	// Also empty, because only one network with empty name is allowed
	if ncfg.BridgeName == o.BridgeName {
		return errors.New("networks have same bridge name")
	}
	// They must be in different subnets
	if a, b := ncfg.AddressIPv4, o.AddressIPv4; a != nil && b != nil {
		if a.Contains(b.IP) || b.Contains(a.IP) {
			return errors.New("networks have overlapping IPv4")
		}
	}
	// They must be in different v6 subnets
	if a, b := ncfg.AddressIPv6, o.AddressIPv6; a != nil && b != nil {
		if a.Contains(b.IP) || b.Contains(a.IP) {
			return errors.New("networks have overlapping IPv6")
		}
	}
	return nil
}
// fromLabels populates the network configuration from network-create label
// values. Unknown labels are ignored; a malformed value returns an
// invalid-parameter error naming the label (see parseErr).
func (ncfg *networkConfiguration) fromLabels(labels map[string]string) error {
	var err error
	for label, value := range labels {
		switch label {
		case BridgeName:
			ncfg.BridgeName = value
		case netlabel.DriverMTU:
			if ncfg.Mtu, err = strconv.Atoi(value); err != nil {
				return parseErr(label, value, err.Error())
			}
		case netlabel.EnableIPv4:
			if ncfg.EnableIPv4, err = strconv.ParseBool(value); err != nil {
				return parseErr(label, value, err.Error())
			}
		case netlabel.EnableIPv6:
			if ncfg.EnableIPv6, err = strconv.ParseBool(value); err != nil {
				return parseErr(label, value, err.Error())
			}
		case EnableIPMasquerade:
			if ncfg.EnableIPMasquerade, err = strconv.ParseBool(value); err != nil {
				return parseErr(label, value, err.Error())
			}
		case IPv4GatewayMode:
			if ncfg.GwModeIPv4, err = newGwMode(value); err != nil {
				return parseErr(label, value, err.Error())
			}
		case IPv6GatewayMode:
			if ncfg.GwModeIPv6, err = newGwMode(value); err != nil {
				return parseErr(label, value, err.Error())
			}
		case EnableICC:
			if ncfg.EnableICC, err = strconv.ParseBool(value); err != nil {
				return parseErr(label, value, err.Error())
			}
		case TrustedHostInterfaces:
			// Colon-separated list; FieldsFunc drops empty elements.
			ncfg.TrustedHostInterfaces = strings.FieldsFunc(value, func(r rune) bool { return r == ':' })
		case InhibitIPv4:
			if ncfg.InhibitIPv4, err = strconv.ParseBool(value); err != nil {
				return parseErr(label, value, err.Error())
			}
		case DefaultBridge:
			if ncfg.DefaultBridge, err = strconv.ParseBool(value); err != nil {
				return parseErr(label, value, err.Error())
			}
		case DefaultBindingIP:
			if ncfg.DefaultBindingIP = net.ParseIP(value); ncfg.DefaultBindingIP == nil {
				return parseErr(label, value, "nil ip")
			}
		case netlabel.ContainerIfacePrefix:
			ncfg.ContainerIfacePrefix = value
		case netlabel.HostIPv4:
			if ncfg.HostIPv4 = net.ParseIP(value); ncfg.HostIPv4 == nil {
				return parseErr(label, value, "nil ip")
			}
		case netlabel.HostIPv6:
			if ncfg.HostIPv6 = net.ParseIP(value); ncfg.HostIPv6 == nil {
				return parseErr(label, value, "nil ip")
			}
		}
	}
	return nil
}
// newGwMode parses a gateway-mode label value into a gwMode. Only "nat",
// "nat-unprotected", "routed" and "isolated" are accepted; anything else
// yields gwModeDefault and an error.
//
// The parameter was renamed from "gwMode" to "mode": the old name shadowed
// the gwMode type inside the function body. Callers are unaffected.
func newGwMode(mode string) (gwMode, error) {
	switch mode {
	case "nat":
		return gwModeNAT, nil
	case "nat-unprotected":
		return gwModeNATUnprot, nil
	case "routed":
		return gwModeRouted, nil
	case "isolated":
		return gwModeIsolated, nil
	}
	return gwModeDefault, fmt.Errorf("unknown gateway mode %s", mode)
}
// routed reports whether the gateway mode is "routed" (no NAT).
func (m gwMode) routed() bool {
	return m == gwModeRouted
}

// unprotected reports whether the gateway mode is "nat-unprotected".
func (m gwMode) unprotected() bool {
	return m == gwModeNATUnprot
}

// isolated reports whether the gateway mode is "isolated".
func (m gwMode) isolated() bool {
	return m == gwModeIsolated
}

// parseErr builds the standard invalid-parameter error for a bad label value.
func parseErr(label, value, errString string) error {
	return types.InvalidParameterErrorf("failed to parse %s value: %v (%s)", label, value, errString)
}
// newFirewallerNetwork builds per-family firewaller configuration for this
// bridge network, registers the bridge interface with firewalld, then asks
// the driver's firewaller for a per-network rule set. If anything fails
// after the firewalld registration, that registration is rolled back.
func (n *bridgeNetwork) newFirewallerNetwork(ctx context.Context) (_ firewaller.Network, retErr error) {
	config4, err := makeNetworkConfigFam(n.config.HostIPv4, n.bridge.bridgeIPv4, n.gwMode(firewaller.IPv4))
	if err != nil {
		return nil, err
	}
	config6, err := makeNetworkConfigFam(n.config.HostIPv6, n.bridge.bridgeIPv6, n.gwMode(firewaller.IPv6))
	if err != nil {
		return nil, err
	}
	if err := iptables.AddInterfaceFirewalld(n.config.BridgeName); err != nil {
		return nil, err
	}
	// Undo the firewalld registration if NewNetwork fails below.
	defer func() {
		if retErr != nil {
			if err := iptables.DelInterfaceFirewalld(n.config.BridgeName); err != nil {
				log.G(ctx).WithError(err).Errorf("failed to delete network level rules following error")
			}
		}
	}()
	return n.driver.firewaller.NewNetwork(ctx, firewaller.NetworkConfig{
		IfName: n.config.BridgeName,
		Internal: n.config.Internal,
		ICC: n.config.EnableICC,
		Masquerade: n.config.EnableIPMasquerade,
		TrustedHostInterfaces: n.config.TrustedHostInterfaces,
		Config4: config4,
		Config6: config6,
	})
}
// makeNetworkConfigFam assembles the firewaller's per-address-family network
// configuration from the host address (may be nil), the bridge subnet (may
// be nil), and the gateway mode. Invalid addresses/prefixes are rejected.
func makeNetworkConfigFam(hostIP net.IP, bridgePrefix *net.IPNet, gwm gwMode) (firewaller.NetworkConfigFam, error) {
	c := firewaller.NetworkConfigFam{
		Routed: gwm.routed(),
		Unprotected: gwm.unprotected(),
	}
	if hostIP != nil {
		var ok bool
		c.HostIP, ok = netip.AddrFromSlice(hostIP)
		if !ok {
			return firewaller.NetworkConfigFam{}, fmt.Errorf("invalid host address for pktFilter %q", hostIP)
		}
		// Normalize IPv4-mapped IPv6 form to plain IPv4.
		c.HostIP = c.HostIP.Unmap()
	}
	if bridgePrefix != nil {
		p, ok := netiputil.ToPrefix(bridgePrefix)
		if !ok {
			return firewaller.NetworkConfigFam{}, fmt.Errorf("invalid bridge prefix for pktFilter %s", bridgePrefix)
		}
		// Keep only the network part of the prefix.
		c.Prefix = p.Masked()
	}
	return c, nil
}
func (n *bridgeNetwork) getNATDisabled() (ipv4, ipv6 bool) {
n.Lock()
defer n.Unlock()
return n.config.GwModeIPv4.routed(), n.config.GwModeIPv6.routed()
}
func (n *bridgeNetwork) gwMode(v firewaller.IPVersion) gwMode {
n.Lock()
defer n.Unlock()
if v == firewaller.IPv4 {
return n.config.GwModeIPv4
}
return n.config.GwModeIPv6
}
// hairpin reports whether the driver is configured for hairpin NAT.
// It returns false when the network has no driver attached.
func (n *bridgeNetwork) hairpin() bool {
	n.Lock()
	defer n.Unlock()
	if d := n.driver; d != nil {
		return d.config.Hairpin
	}
	return false
}
// portMappers returns the driver's registered port mappers, or nil when the
// network has no driver attached.
func (n *bridgeNetwork) portMappers() *drvregistry.PortMappers {
	n.Lock()
	defer n.Unlock()
	if d := n.driver; d != nil {
		return d.portmappers
	}
	return nil
}
// getEndpoint looks up an endpoint by id. It returns (nil, nil) when the id
// is well-formed but unknown, and an error only for an empty id.
func (n *bridgeNetwork) getEndpoint(eid string) (*bridgeEndpoint, error) {
	if eid == "" {
		return nil, invalidEndpointIDError(eid)
	}
	n.Lock()
	// A missing key yields the nil pointer, matching the "not found" contract.
	ep := n.endpoints[eid]
	n.Unlock()
	return ep, nil
}
// configure initialises the driver from the netlabel.GenericData option (if
// present), constructs the firewaller backend, and registers for firewalld
// reload events before loading persisted networks from the store.
func (d *driver) configure(option map[string]interface{}) error {
	var config configuration
	switch opt := option[netlabel.GenericData].(type) {
	case options.Generic:
		// Options arrived as a generic map; convert them to the typed struct.
		opaqueConfig, err := options.GenerateFromModel(opt, &configuration{})
		if err != nil {
			return err
		}
		config = *opaqueConfig.(*configuration)
	case *configuration:
		config = *opt
	case nil:
		// No GenericData option set. Use defaults.
	default:
		return errdefs.InvalidParameter(fmt.Errorf("invalid configuration type (%T) passed", opt))
	}
	var err error
	d.firewaller, err = newFirewaller(context.Background(), firewaller.Config{
		IPv4:               config.EnableIPTables,
		IPv6:               config.EnableIP6Tables,
		Hairpin:            config.Hairpin,
		AllowDirectRouting: config.AllowDirectRouting,
		WSL2Mirrored:       isRunningUnderWSL2MirroredMode(context.Background()),
	})
	if err != nil {
		return err
	}
	d.Lock()
	d.config = config
	d.Unlock()
	// Register for an event when firewalld is reloaded, but take the config lock so
	// that events won't be processed until the initial load from Store is complete.
	d.configNetwork.Lock()
	defer d.configNetwork.Unlock()
	iptables.OnReloaded(d.handleFirewalldReload)
	return d.initStore()
}
// newFirewaller constructs the firewaller backend: nftables when enabled,
// iptables otherwise. Whichever backend is chosen, stale rules left by the
// other backend are cleaned up. It is a package-level variable so tests can
// substitute a fake implementation.
var newFirewaller = func(ctx context.Context, config firewaller.Config) (firewaller.Firewaller, error) {
	if nftables.Enabled() {
		fw, err := nftabler.NewNftabler(ctx, config)
		if err != nil {
			return nil, err
		}
		// Without seeing config (interface names, addresses, and so on), the iptabler's
		// cleaner can't clean up network or port-specific rules that may have been added
		// to iptables built-in chains. So, if cleanup is needed, give the cleaner to
		// the nftabler. Then, it'll use it to delete old rules as networks are restored.
		fw.(firewaller.FirewallCleanerSetter).SetFirewallCleaner(iptabler.NewCleaner(ctx, config))
		return fw, nil
	}
	// The nftabler can clean all of its rules in one go. So, even if there's cleanup
	// to do, there's no need to pass a cleaner to the iptabler.
	nftabler.Cleanup(ctx, config)
	return iptabler.NewIptabler(ctx, config)
}
// getNetwork returns the network with the given id, an InvalidParameter error
// for an empty id, or a NotFound error when no such network exists.
func (d *driver) getNetwork(id string) (*bridgeNetwork, error) {
	d.Lock()
	defer d.Unlock()
	if id == "" {
		return nil, types.InvalidParameterErrorf("invalid network id: %s", id)
	}
	nw, ok := d.networks[id]
	if !ok {
		return nil, types.NotFoundErrorf("network not found: %s", id)
	}
	return nw, nil
}
// parseNetworkGenericOptions converts the GenericData payload into a
// *networkConfiguration. It accepts either an already-typed configuration or
// a label map (in which case ICC and IP masquerade default to enabled).
func parseNetworkGenericOptions(data interface{}) (*networkConfiguration, error) {
	switch opt := data.(type) {
	case *networkConfiguration:
		return opt, nil
	case map[string]string:
		cfg := &networkConfiguration{
			EnableICC:          true,
			EnableIPMasquerade: true,
		}
		// Return the config even when label parsing fails, matching the
		// original contract (caller checks the error first).
		return cfg, cfg.fromLabels(opt)
	default:
		return nil, types.InvalidParameterErrorf("do not recognize network configuration format: %T", opt)
	}
}
// processIPAM copies addresses and gateways from the IPAM data into the
// network configuration. At most one subnet per address family is supported.
func (ncfg *networkConfiguration) processIPAM(ipamV4Data, ipamV6Data []driverapi.IPAMData) error {
	if len(ipamV4Data) > 1 || len(ipamV6Data) > 1 {
		return types.ForbiddenErrorf("bridge driver doesn't support multiple subnets")
	}
	if len(ipamV4Data) > 0 {
		ipam := ipamV4Data[0]
		ncfg.AddressIPv4 = ipam.Pool
		if ipam.Gateway != nil {
			// An explicit gateway overrides the pool as the bridge address.
			ncfg.AddressIPv4 = types.GetIPNetCopy(ipam.Gateway)
		}
		if gw, ok := ipam.AuxAddresses[DefaultGatewayV4AuxKey]; ok {
			ncfg.DefaultGatewayIPv4 = gw.IP
		}
	}
	if len(ipamV6Data) > 0 {
		ipam := ipamV6Data[0]
		ncfg.AddressIPv6 = ipam.Pool
		if ipam.Gateway != nil {
			// An explicit gateway overrides the pool as the bridge address.
			ncfg.AddressIPv6 = types.GetIPNetCopy(ipam.Gateway)
		}
		if gw, ok := ipam.AuxAddresses[DefaultGatewayV6AuxKey]; ok {
			ncfg.DefaultGatewayIPv6 = gw.IP
		}
	}
	return nil
}
// parseNetworkOptions builds a networkConfiguration for network id from the
// generic options, then applies the well-known labels on top. It also picks
// the bridge device name and records whether the bridge device already exists
// (which decides who is responsible for deleting it later).
//
// NOTE(review): id[:12] assumes id is at least 12 characters long; callers
// pass full-length network ids (see GetSkipGwAlloc's dummy id) — confirm
// before calling with shorter ids.
func parseNetworkOptions(id string, option options.Generic) (*networkConfiguration, error) {
	var (
		err    error
		config = &networkConfiguration{}
	)
	// Parse generic label first, config will be re-assigned
	if genData, ok := option[netlabel.GenericData]; ok && genData != nil {
		if config, err = parseNetworkGenericOptions(genData); err != nil {
			return nil, err
		}
	}
	// Process well-known labels next
	if val, ok := option[netlabel.EnableIPv4]; ok {
		config.EnableIPv4 = val.(bool)
	}
	if val, ok := option[netlabel.EnableIPv6]; ok {
		config.EnableIPv6 = val.(bool)
	}
	if val, ok := option[netlabel.Internal]; ok {
		if internal, ok := val.(bool); ok && internal {
			config.Internal = true
		}
	}
	// Derive a bridge name from the truncated network id, unless configured.
	if config.BridgeName == "" && !config.DefaultBridge {
		config.BridgeName = "br-" + id[:12]
	}
	exists, err := bridgeInterfaceExists(config.BridgeName)
	if err != nil {
		return nil, err
	}
	if (config.GwModeIPv4.isolated() || config.GwModeIPv6.isolated()) && !config.Internal {
		return nil, errors.New("gateway mode 'isolated' can only be used for an internal network")
	}
	if !exists {
		config.BridgeIfaceCreator = ifaceCreatedByLibnetwork
	} else {
		config.BridgeIfaceCreator = ifaceCreatedByUser
	}
	config.ID = id
	return config, nil
}
// getNetworks snapshots the driver's networks into a fresh slice so the
// caller can iterate without holding the driver lock.
func (d *driver) getNetworks() []*bridgeNetwork {
	d.Lock()
	networks := make([]*bridgeNetwork, 0, len(d.networks))
	for _, nw := range d.networks {
		networks = append(networks, nw)
	}
	d.Unlock()
	return networks
}
// NetworkAllocate is not supported by the bridge driver.
func (d *driver) NetworkAllocate(id string, option map[string]string, ipV4Data, ipV6Data []driverapi.IPAMData) (map[string]string, error) {
	return nil, types.NotImplementedErrorf("not implemented")
}
// NetworkFree is not supported by the bridge driver.
func (d *driver) NetworkFree(id string) error {
	return types.NotImplementedErrorf("not implemented")
}
// EventNotify is a no-op: the bridge driver does not consume table events.
func (d *driver) EventNotify(etype driverapi.EventType, nid, tableName, key string, value []byte) {
}
// DecodeTableEntry is a no-op: the bridge driver publishes no table entries.
func (d *driver) DecodeTableEntry(tablename string, key string, value []byte) (string, map[string]string) {
	return "", nil
}
// GetSkipGwAlloc reports, per address family, whether gateway address
// allocation can be skipped for a network that would be created with opts.
func (d *driver) GetSkipGwAlloc(opts options.Generic) (ipv4, ipv6 bool, _ error) {
	// The network doesn't exist yet, so use a dummy id that's long enough to be
	// truncated to a short-id (12 characters) and used in the bridge device name.
	cfg, err := parseNetworkOptions("dummyNetworkId", opts)
	if err != nil {
		return false, false, err
	}
	// An isolated network should not have a gateway. Also, cfg.InhibitIPv4 means no
	// gateway address will be assigned to the bridge. So, if the network is also
	// cfg.Internal, there will not be a default route to use the gateway address.
	ipv4 = cfg.GwModeIPv4.isolated() || (cfg.InhibitIPv4 && cfg.Internal)
	ipv6 = cfg.GwModeIPv6.isolated()
	return ipv4, ipv6, nil
}
// CreateNetwork creates a new network using the bridge driver.
// It validates the options and IPAM data, then serialises network creation
// (and the conflict check) under the configNetwork lock before persisting the
// result to the store.
func (d *driver) CreateNetwork(ctx context.Context, id string, option map[string]interface{}, nInfo driverapi.NetworkInfo, ipV4Data, ipV6Data []driverapi.IPAMData) error {
	// Sanity checks
	d.Lock()
	if _, ok := d.networks[id]; ok {
		d.Unlock()
		return types.ForbiddenErrorf("network %s exists", id)
	}
	d.Unlock()
	// Parse the config.
	config, err := parseNetworkOptions(id, option)
	if err != nil {
		return err
	}
	if !config.EnableIPv4 && !config.EnableIPv6 {
		return types.InvalidParameterErrorf("IPv4 or IPv6 must be enabled")
	}
	// An all-zero pool means IPAM supplied nothing usable for the family.
	if config.EnableIPv4 && (len(ipV4Data) == 0 || ipV4Data[0].Pool.String() == "0.0.0.0/0") {
		return types.InvalidParameterErrorf("ipv4 pool is empty")
	}
	if config.EnableIPv6 && (len(ipV6Data) == 0 || ipV6Data[0].Pool.String() == "::/0") {
		return types.InvalidParameterErrorf("ipv6 pool is empty")
	}
	// Add IP addresses/gateways to the configuration.
	if err = config.processIPAM(ipV4Data, ipV6Data); err != nil {
		return err
	}
	// Validate the configuration
	if err = config.Validate(); err != nil {
		return err
	}
	// start the critical section, from this point onward we are dealing with the list of networks
	// so to be consistent we cannot allow that the list changes
	d.configNetwork.Lock()
	defer d.configNetwork.Unlock()
	// check network conflicts
	if err = d.checkConflict(config); err != nil {
		return err
	}
	// there is no conflict, now create the network
	if err = d.createNetwork(ctx, config); err != nil {
		return err
	}
	return d.storeUpdate(ctx, config)
}
// checkConflict returns a Forbidden error when config clashes (bridge name,
// subnets, ...) with any existing network's configuration.
func (d *driver) checkConflict(config *networkConfiguration) error {
	for _, nw := range d.getNetworks() {
		nw.Lock()
		existing := nw.config
		nw.Unlock()
		if err := existing.Conflicts(config); err != nil {
			return types.ForbiddenErrorf("cannot create network %s (%s): conflicts with network %s (%s): %s",
				config.ID, config.BridgeName, existing.ID, existing.BridgeName, err.Error())
		}
	}
	return nil
}
// createNetwork performs the actual network setup: it creates (or adopts) the
// bridge device, registers the network with the driver, and then runs an
// ordered queue of setup steps (addresses, forwarding, sysctls, firewall
// rules), aborting and rolling back the registration on the first error.
// Callers must hold d.configNetwork.
func (d *driver) createNetwork(ctx context.Context, config *networkConfiguration) (err error) {
	ctx, span := otel.Tracer("").Start(ctx, spanPrefix+".createNetwork", trace.WithAttributes(
		attribute.Bool("bridge.enable_ipv4", config.EnableIPv4),
		attribute.Bool("bridge.enable_ipv6", config.EnableIPv6),
		attribute.Bool("bridge.icc", config.EnableICC),
		attribute.Int("bridge.mtu", config.Mtu),
		attribute.Bool("bridge.internal", config.Internal)))
	defer func() {
		otelutil.RecordStatus(span, err)
		span.End()
	}()
	// Create or retrieve the bridge L3 interface
	bridgeIface, err := newInterface(d.nlh, config)
	if err != nil {
		return err
	}
	// Create and set network handler in driver
	network := &bridgeNetwork{
		id:        config.ID,
		endpoints: make(map[string]*bridgeEndpoint),
		config:    config,
		bridge:    bridgeIface,
		driver:    d,
	}
	d.Lock()
	d.networks[config.ID] = network
	d.Unlock()
	// On failure make sure to reset driver network handler to nil
	defer func() {
		if err != nil {
			d.Lock()
			delete(d.networks, config.ID)
			d.Unlock()
		}
	}()
	// Prepare the bridge setup configuration
	bridgeSetup := newBridgeSetup(config, bridgeIface)
	// If the bridge interface doesn't exist, we need to start the setup steps
	// by creating a new device and assigning it an IPv4 address.
	bridgeAlreadyExists := bridgeIface.exists()
	if !bridgeAlreadyExists {
		bridgeSetup.queueStep("setupDevice", setupDevice)
		bridgeSetup.queueStep("setupDefaultSysctl", setupDefaultSysctl)
	}
	// For the default bridge, set expected sysctls
	// NOTE(review): when the default bridge is newly created, this queues
	// setupDefaultSysctl a second time — presumably harmless as the step looks
	// idempotent; confirm before de-duplicating.
	if config.DefaultBridge {
		bridgeSetup.queueStep("setupDefaultSysctl", setupDefaultSysctl)
	}
	// Always set the bridge's MTU if specified. This is purely cosmetic; a bridge's
	// MTU is the min MTU of device connected to it, and MTU will be set on each
	// 'veth'. But, for a non-default MTU, the bridge's MTU will look wrong until a
	// container is attached.
	if config.Mtu > 0 {
		bridgeSetup.queueStep("setupMTU", setupMTU)
	}
	// Module br_netfilter needs to be loaded with net.bridge.bridge-nf-call-ip[6]tables
	// enabled to implement icc=false, or DNAT when hairpin mode is enabled.
	enableBrNfCallIptables := !config.EnableICC || d.config.Hairpin
	// Conditionally queue setup steps depending on configuration values.
	for _, step := range []struct {
		Condition bool
		StepName  string
		StepFn    stepFn
	}{
		// Even if a bridge exists try to setup IPv4.
		{config.EnableIPv4, "setupBridgeIPv4", setupBridgeIPv4},
		// Enable IPv6 on the bridge if required. We do this even for a
		// previously existing bridge, as it may be here from a previous
		// installation where IPv6 wasn't supported yet and needs to be
		// assigned an IPv6 link-local address.
		{config.EnableIPv6, "setupBridgeIPv6", setupBridgeIPv6},
		// Ensure the bridge has the expected IPv4 addresses in the case of a previously
		// existing device.
		{config.EnableIPv4 && bridgeAlreadyExists && !config.InhibitIPv4, "setupVerifyAndReconcileIPv4", setupVerifyAndReconcileIPv4},
		// Enable IP Forwarding
		{
			config.EnableIPv4 && d.config.EnableIPForwarding,
			"setupIPv4Forwarding",
			func(*networkConfiguration, *bridgeInterface) error {
				return setupIPv4Forwarding(d.firewaller, d.config.EnableIPTables && !d.config.DisableFilterForwardDrop)
			},
		},
		{
			config.EnableIPv6 && d.config.EnableIPForwarding,
			"setupIPv6Forwarding",
			func(*networkConfiguration, *bridgeInterface) error {
				return setupIPv6Forwarding(d.firewaller, d.config.EnableIP6Tables && !d.config.DisableFilterForwardDrop)
			},
		},
		// Setup Loopback Addresses Routing
		{d.config.Hairpin, "setupLoopbackAddressesRouting", setupLoopbackAddressesRouting},
		// Setup DefaultGatewayIPv4
		{config.DefaultGatewayIPv4 != nil, "setupGatewayIPv4", setupGatewayIPv4},
		// Setup DefaultGatewayIPv6
		{config.DefaultGatewayIPv6 != nil, "setupGatewayIPv6", setupGatewayIPv6},
		// Configure bridge networking filtering if needed and IP tables are enabled
		{enableBrNfCallIptables && d.config.EnableIPTables, "setupIPv4BridgeNetFiltering", setupIPv4BridgeNetFiltering},
		{enableBrNfCallIptables && d.config.EnableIP6Tables, "setupIPv6BridgeNetFiltering", setupIPv6BridgeNetFiltering},
	} {
		if step.Condition {
			bridgeSetup.queueStep(step.StepName, step.StepFn)
		}
	}
	// Program firewall rules for the network once addressing is in place.
	bridgeSetup.queueStep("addfirewallerNetwork", func(*networkConfiguration, *bridgeInterface) error {
		n, err := network.newFirewallerNetwork(ctx)
		if err != nil {
			return err
		}
		network.firewallerNetwork = n
		return nil
	})
	// Apply the prepared list of steps, and abort at the first error.
	bridgeSetup.queueStep("setupDeviceUp", setupDeviceUp)
	// Test hook: force a late setup failure for the named bridge.
	if v := os.Getenv("DOCKER_TEST_BRIDGE_INIT_ERROR"); v == config.BridgeName {
		bridgeSetup.queueStep("fakeError", func(n *networkConfiguration, b *bridgeInterface) error {
			return fmt.Errorf("DOCKER_TEST_BRIDGE_INIT_ERROR is %q", v)
		})
	}
	return bridgeSetup.apply(ctx)
}
// DeleteNetwork removes network nid, serialising with network creation and
// firewalld reload handling via the configNetwork lock.
func (d *driver) DeleteNetwork(nid string) error {
	d.configNetwork.Lock()
	defer d.configNetwork.Unlock()
	return d.deleteNetwork(nid)
}
// deleteNetwork removes network nid from the driver and the persistent store,
// releasing its endpoints' ports and links and, when libnetwork created it,
// deleting the bridge device itself. Callers must hold d.configNetwork.
//
// Bug fix: the deferred block below is meant to put the network back into
// d.networks when deletion fails, but previously every error in this function
// was captured by a `:=` inside an if-statement (shadowing the outer err) and
// the final storeDelete result was returned directly — so the outer err was
// never non-nil and the restore was dead code. The final store error is now
// assigned to the outer err so the rollback actually runs.
func (d *driver) deleteNetwork(nid string) error {
	var err error
	// Get network handler and remove it from driver
	d.Lock()
	n, ok := d.networks[nid]
	d.Unlock()
	if !ok {
		// If the network was successfully created by an earlier incarnation of the daemon,
		// but it failed to initialise this time, the network is still in the store (in
		// case whatever caused the failure can be fixed for a future daemon restart). But,
		// it's not in d.networks. To prevent the driver's state from getting out of step
		// with its parent, make sure it's not in the store before reporting that it does
		// not exist.
		if err := d.storeDelete(&networkConfiguration{ID: nid}); err != nil && !errors.Is(err, datastore.ErrKeyNotFound) {
			log.G(context.TODO()).WithFields(log.Fields{
				"error":   err,
				"network": nid,
			}).Warnf("Failed to delete network from bridge store")
		}
		return types.InternalMaskableErrorf("network %s does not exist", nid)
	}
	n.Lock()
	config := n.config
	n.Unlock()
	// Delete endpoints belonging to this network; each step is best-effort.
	for _, ep := range n.endpoints {
		if err := n.releasePorts(ep); err != nil {
			log.G(context.TODO()).Warn(err)
		}
		if link, err := d.nlh.LinkByName(ep.srcName); err == nil {
			if err := d.nlh.LinkDel(link); err != nil {
				log.G(context.TODO()).WithError(err).Errorf("Failed to delete interface (%s)'s link on endpoint (%s) delete", ep.srcName, ep.id)
			}
		}
		if err := d.storeDelete(ep); err != nil {
			log.G(context.TODO()).Warnf("Failed to remove bridge endpoint %.7s from store: %v", ep.id, err)
		}
	}
	d.Lock()
	delete(d.networks, nid)
	d.Unlock()
	// On failure set network handler back in driver, but
	// only if is not already taken over by some other thread
	defer func() {
		if err != nil {
			d.Lock()
			if _, ok := d.networks[nid]; !ok {
				d.networks[nid] = n
			}
			d.Unlock()
		}
	}()
	switch config.BridgeIfaceCreator {
	case ifaceCreatedByLibnetwork, ifaceCreatorUnknown:
		// We only delete the bridge if it was created by the bridge driver and
		// it is not the default one (to keep the backward compatible behavior.)
		if !config.DefaultBridge {
			if err := d.nlh.LinkDel(n.bridge.Link); err != nil {
				log.G(context.TODO()).Warnf("Failed to remove bridge interface %s on network %s delete: %v", config.BridgeName, nid, err)
			}
		}
	case ifaceCreatedByUser:
		// Don't delete the bridge interface if it was not created by libnetwork.
	}
	if err := n.firewallerNetwork.DelNetworkLevelRules(context.TODO()); err != nil {
		log.G(context.TODO()).WithError(err).Warnf("Failed to clean iptables rules for bridge network")
	}
	if err := iptables.DelInterfaceFirewalld(n.config.BridgeName); err != nil {
		log.G(context.TODO()).WithError(err).Warnf("Failed to clean firewalld rules for bridge network")
	}
	// Assign (not shadow) so the deferred restore above fires on failure.
	err = d.storeDelete(config)
	return err
}
// addToBridge enslaves interface ifaceName to the bridge device bridgeName
// via netlink.
func addToBridge(ctx context.Context, nlh nlwrap.Handle, ifaceName, bridgeName string) error {
	ctx, span := otel.Tracer("").Start(ctx, spanPrefix+".addToBridge", trace.WithAttributes(
		attribute.String("ifaceName", ifaceName),
		attribute.String("bridgeName", bridgeName)))
	defer span.End()
	iface, err := nlh.LinkByName(ifaceName)
	if err != nil {
		return fmt.Errorf("could not find interface %s: %v", ifaceName, err)
	}
	// Only the bridge's name is needed to set it as the interface's master.
	bridge := &netlink.Bridge{LinkAttrs: netlink.LinkAttrs{Name: bridgeName}}
	if err := nlh.LinkSetMaster(iface, bridge); err != nil {
		log.G(ctx).WithError(err).Errorf("Failed to add %s to bridge via netlink", ifaceName)
		return err
	}
	return nil
}
// setHairpinMode toggles hairpin mode on a bridge port via netlink.
func setHairpinMode(nlh nlwrap.Handle, link netlink.Link, enable bool) error {
	if err := nlh.LinkSetHairpin(link, enable); err != nil {
		return fmt.Errorf("unable to set hairpin mode on %s via netlink: %v",
			link.Attrs().Name, err)
	}
	return nil
}
// CreateEndpoint creates the veth pair for an endpoint, attaches the host end
// to the network's bridge, programs per-endpoint firewall rules, brings the
// host link up, and persists the endpoint to the store. Each resource
// acquired along the way is released by a deferred cleanup if a later step
// fails (the defers key off the named err, so inner errors must assign it).
func (d *driver) CreateEndpoint(ctx context.Context, nid, eid string, ifInfo driverapi.InterfaceInfo, _ map[string]interface{}) error {
	if ifInfo == nil {
		return errors.New("invalid interface info passed")
	}
	ctx, span := otel.Tracer("").Start(ctx, spanPrefix+".CreateEndpoint", trace.WithAttributes(
		attribute.String("nid", nid),
		attribute.String("eid", eid)))
	defer span.End()
	// Get the network handler and make sure it exists
	d.Lock()
	n, ok := d.networks[nid]
	dconfig := d.config
	d.Unlock()
	if !ok {
		return types.NotFoundErrorf("network %s does not exist", nid)
	}
	if n == nil {
		return driverapi.ErrNoNetwork(nid)
	}
	// Sanity check
	n.Lock()
	if n.id != nid {
		n.Unlock()
		return invalidNetworkIDError(nid)
	}
	n.Unlock()
	// Check if endpoint id is good and retrieve correspondent endpoint
	ep, err := n.getEndpoint(eid)
	if err != nil {
		return err
	}
	// Endpoint with that id exists either on desired or other sandbox
	if ep != nil {
		return driverapi.ErrEndpointExists(eid)
	}
	// Create and add the endpoint
	n.Lock()
	endpoint := &bridgeEndpoint{id: eid, nid: nid}
	n.endpoints[eid] = endpoint
	n.Unlock()
	// On failure make sure to remove the endpoint
	defer func() {
		if err != nil {
			n.Lock()
			delete(n.endpoints, eid)
			n.Unlock()
		}
	}()
	// Generate a name for what will be the host side pipe interface
	hostIfName, err := netutils.GenerateIfaceName(d.nlh, vethPrefix, vethLen)
	if err != nil {
		return err
	}
	// Generate a name for what will be the sandbox side pipe interface
	containerIfName, err := netutils.GenerateIfaceName(d.nlh, vethPrefix, vethLen)
	if err != nil {
		return err
	}
	// Generate and add the interface pipe host <-> sandbox.
	// When the container end was created directly in the container's netns,
	// createVeth returns a handle in that namespace; otherwise operate on
	// both ends through the driver's own handle.
	nlhSb := d.nlh
	if nlh, err := createVeth(ctx, hostIfName, containerIfName, ifInfo, d.nlh); err != nil {
		return err
	} else if nlh != nil {
		defer nlh.Close()
		nlhSb = *nlh
	}
	// Get the host side pipe interface handler
	host, err := d.nlh.LinkByName(hostIfName)
	if err != nil {
		return types.InternalErrorf("failed to find host side interface %s: %v", hostIfName, err)
	}
	defer func() {
		if err != nil {
			if err := d.nlh.LinkDel(host); err != nil {
				log.G(ctx).WithError(err).Warnf("Failed to delete host side interface (%s)'s link", hostIfName)
			}
		}
	}()
	// Get the sandbox side pipe interface handler
	sbox, err := nlhSb.LinkByName(containerIfName)
	if err != nil {
		return types.InternalErrorf("failed to find sandbox side interface %s: %v", containerIfName, err)
	}
	defer func() {
		if err != nil {
			if err := nlhSb.LinkDel(sbox); err != nil {
				log.G(ctx).WithError(err).Warnf("Failed to delete sandbox side interface (%s)'s link", containerIfName)
			}
		}
	}()
	n.Lock()
	config := n.config
	n.Unlock()
	// Add bridge inherited attributes to pipe interfaces
	if config.Mtu != 0 {
		err = d.nlh.LinkSetMTU(host, config.Mtu)
		if err != nil {
			return types.InternalErrorf("failed to set MTU on host interface %s: %v", hostIfName, err)
		}
		err = nlhSb.LinkSetMTU(sbox, config.Mtu)
		if err != nil {
			return types.InternalErrorf("failed to set MTU on sandbox interface %s: %v", containerIfName, err)
		}
	}
	// Attach host side pipe interface into the bridge
	if err = addToBridge(ctx, d.nlh, hostIfName, config.BridgeName); err != nil {
		return fmt.Errorf("adding interface %s to bridge %s failed: %v", hostIfName, config.BridgeName, err)
	}
	if dconfig.Hairpin {
		err = setHairpinMode(d.nlh, host, true)
		if err != nil {
			return err
		}
	}
	// Store the sandbox side pipe interface parameters
	endpoint.srcName = containerIfName
	endpoint.macAddress = ifInfo.MacAddress()
	endpoint.addr = ifInfo.Address()
	endpoint.addrv6 = ifInfo.AddressIPv6()
	// No MAC supplied by the caller: generate one and report it back.
	if endpoint.macAddress == nil {
		endpoint.macAddress = netutils.GenerateRandomMAC()
		if err := ifInfo.SetMacAddress(endpoint.macAddress); err != nil {
			return err
		}
	}
	netip4, netip6 := endpoint.netipAddrs()
	if err := n.firewallerNetwork.AddEndpoint(ctx, netip4, netip6); err != nil {
		return err
	}
	// Up the host interface after finishing all netlink configuration
	if err = d.linkUp(ctx, host); err != nil {
		return fmt.Errorf("could not set link up for host interface %s: %v", hostIfName, err)
	}
	log.G(ctx).WithFields(log.Fields{
		"hostifname": host.Attrs().Name,
		"ifi":        host.Attrs().Index,
	}).Debug("bridge endpoint host link is up")
	if err = d.storeUpdate(ctx, endpoint); err != nil {
		return fmt.Errorf("failed to save bridge endpoint %.7s to store: %v", endpoint.id, err)
	}
	return nil
}
// netipAddrs converts ep.addr and ep.addrv6 from net.IPNet to netip.Addr.
// A nil field yields the zero netip.Addr; non-nil addresses are assumed valid
// (the conversion error is deliberately discarded).
func (ep *bridgeEndpoint) netipAddrs() (v4, v6 netip.Addr) {
	if a := ep.addr; a != nil {
		v4, _ = netip.AddrFromSlice(a.IP)
	}
	if a := ep.addrv6; a != nil {
		v6, _ = netip.AddrFromSlice(a.IP)
	}
	return v4, v6
}
// createVeth creates a veth device with one end in the container's network namespace,
// if it can get hold of the netns path and open the handles. In that case, it returns
// a netlink handle in the container's namespace that must be closed by the caller.
//
// If the netns path isn't available, possibly because the netns hasn't been created
// yet, or it's not possible to get a netns or netlink handle in the container's
// namespace - both ends of the veth device are created in nlh's netns, and no netlink
// handle is returned.
//
// (Only the error from creating the interface is returned. Failure to create the
// interface in the container's netns is not an error.)
func createVeth(ctx context.Context, hostIfName, containerIfName string, ifInfo driverapi.InterfaceInfo, nlh nlwrap.Handle) (nlhCtr *nlwrap.Handle, retErr error) {
	veth := &netlink.Veth{
		LinkAttrs: netlink.LinkAttrs{Name: hostIfName, TxQLen: 0},
		PeerName:  containerIfName,
	}
	// Each branch below is a fallback: missing path, unopenable netns, or no
	// netlink handle all degrade to creating both ends in the host netns.
	if nspath := ifInfo.NetnsPath(); nspath == "" {
		log.G(ctx).WithField("ifname", containerIfName).Debug("No container netns path, creating interface in host netns")
	} else if netnsh, err := netns.GetFromPath(nspath); err != nil {
		log.G(ctx).WithFields(log.Fields{
			"error":  err,
			"netns":  nspath,
			"ifname": containerIfName,
		}).Warn("No container netns, creating interface in host netns")
	} else {
		defer netnsh.Close()
		if nh, err := nlwrap.NewHandleAt(netnsh, syscall.NETLINK_ROUTE); err != nil {
			log.G(ctx).WithFields(log.Fields{
				"error": err,
				"netns": nspath,
			}).Warn("No netlink handle for container, creating interface in host netns")
		} else {
			// Close the container handle if LinkAdd fails; on success the
			// caller owns it.
			defer func() {
				if retErr != nil {
					nh.Close()
				}
			}()
			veth.PeerNamespace = netlink.NsFd(netnsh)
			nlhCtr = &nh
			ifInfo.SetCreatedInContainer(true)
		}
	}
	if err := nlh.LinkAdd(veth); err != nil {
		return nil, types.InternalErrorf("failed to add the host (%s) <=> sandbox (%s) pair interfaces: %v", hostIfName, containerIfName, err)
	}
	return nlhCtr, nil
}
// linkUp brings the given host-side link up, recording a trace span for the
// operation.
func (d *driver) linkUp(ctx context.Context, host netlink.Link) error {
	_, span := otel.Tracer("").Start(ctx, spanPrefix+".linkUp", trace.WithAttributes(
		attribute.String("host", host.Attrs().Name)))
	defer span.End()
	return d.nlh.LinkSetUp(host)
}
// DeleteEndpoint removes endpoint eid from network nid: firewall rules first,
// then the endpoint map entry, then (best-effort) the veth link and the store
// record. If a later step fails, the endpoint is put back into the network's
// map (the defer keys off the named err).
func (d *driver) DeleteEndpoint(nid, eid string) error {
	var err error
	// Get the network handler and make sure it exists
	d.Lock()
	n, ok := d.networks[nid]
	d.Unlock()
	if !ok {
		return types.InternalMaskableErrorf("network %s does not exist", nid)
	}
	if n == nil {
		return driverapi.ErrNoNetwork(nid)
	}
	// Sanity Check
	n.Lock()
	if n.id != nid {
		n.Unlock()
		return invalidNetworkIDError(nid)
	}
	n.Unlock()
	// Check endpoint id and if an endpoint is actually there
	ep, err := n.getEndpoint(eid)
	if err != nil {
		return err
	}
	if ep == nil {
		return endpointNotFoundError(eid)
	}
	// Drop the endpoint's firewall rules before tearing anything else down.
	netip4, netip6 := ep.netipAddrs()
	if err := n.firewallerNetwork.DelEndpoint(context.TODO(), netip4, netip6); err != nil {
		return err
	}
	// Remove it
	n.Lock()
	delete(n.endpoints, eid)
	n.Unlock()
	// On failure make sure to set back ep in n.endpoints, but only
	// if it hasn't been taken over already by some other thread.
	defer func() {
		if err != nil {
			n.Lock()
			if _, ok := n.endpoints[eid]; !ok {
				n.endpoints[eid] = ep
			}
			n.Unlock()
		}
	}()
	// Try removal of link. Discard error: it is a best effort.
	// Also make sure defer does not see this error either.
	if link, err := d.nlh.LinkByName(ep.srcName); err == nil {
		if err := d.nlh.LinkDel(link); err != nil {
			log.G(context.TODO()).WithError(err).Errorf("Failed to delete interface (%s)'s link on endpoint (%s) delete", ep.srcName, ep.id)
		}
	}
	if err := d.storeDelete(ep); err != nil {
		log.G(context.TODO()).Warnf("Failed to remove bridge endpoint %.7s from store: %v", ep.id, err)
	}
	return nil
}
// EndpointOperInfo returns operational data for endpoint eid on network nid:
// exposed ports, programmed port mappings, and the MAC address. Slices are
// deep-copied so callers cannot mutate driver state.
func (d *driver) EndpointOperInfo(nid, eid string) (map[string]interface{}, error) {
	// Get the network handler and make sure it exists
	d.Lock()
	n, ok := d.networks[nid]
	d.Unlock()
	if !ok {
		return nil, types.NotFoundErrorf("network %s does not exist", nid)
	}
	if n == nil {
		return nil, driverapi.ErrNoNetwork(nid)
	}
	// Sanity check
	n.Lock()
	mismatch := n.id != nid
	n.Unlock()
	if mismatch {
		return nil, invalidNetworkIDError(nid)
	}
	// Check if endpoint id is good and retrieve correspondent endpoint
	ep, err := n.getEndpoint(eid)
	if err != nil {
		return nil, err
	}
	if ep == nil {
		return nil, driverapi.ErrNoEndpoint(eid)
	}
	res := make(map[string]interface{})
	if cfg := ep.extConnConfig; cfg != nil && cfg.ExposedPorts != nil {
		// Return a copy of the config data
		ports := make([]types.TransportPort, 0, len(cfg.ExposedPorts))
		for _, tp := range cfg.ExposedPorts {
			ports = append(ports, tp.GetCopy())
		}
		res[netlabel.ExposedPorts] = ports
	}
	if ep.portMapping != nil {
		// Return a copy of the operational data
		bindings := make([]types.PortBinding, 0, len(ep.portMapping))
		for _, pm := range ep.portMapping {
			bindings = append(bindings, pm.PortBinding.GetCopy())
		}
		res[netlabel.PortMap] = bindings
	}
	if len(ep.macAddress) != 0 {
		res[netlabel.MacAddress] = ep.macAddress
	}
	return res, nil
}
// Join method is invoked when a Sandbox is attached to an endpoint.
// It parses the sandbox options, names the container-side interface, and
// communicates gateways back through jinfo. When ICC is disabled, link-based
// access rules are programmed instead.
func (d *driver) Join(ctx context.Context, nid, eid string, sboxKey string, jinfo driverapi.JoinInfo, epOpts, sbOpts map[string]interface{}) error {
	ctx, span := otel.Tracer("").Start(ctx, spanPrefix+".Join", trace.WithAttributes(
		attribute.String("nid", nid),
		attribute.String("eid", eid),
		attribute.String("sboxKey", sboxKey)))
	defer span.End()
	network, err := d.getNetwork(nid)
	if err != nil {
		return err
	}
	endpoint, err := network.getEndpoint(eid)
	if err != nil {
		return err
	}
	if endpoint == nil {
		return endpointNotFoundError(eid)
	}
	endpoint.containerConfig, err = parseContainerOptions(sbOpts)
	if err != nil {
		return err
	}
	endpoint.extConnConfig, err = parseConnectivityOptions(sbOpts)
	if err != nil {
		return err
	}
	iNames := jinfo.InterfaceName()
	// The in-container interface prefix is configurable per network.
	containerVethPrefix := defaultContainerVethPrefix
	if network.config.ContainerIfacePrefix != "" {
		containerVethPrefix = network.config.ContainerIfacePrefix
	}
	if err := iNames.SetNames(endpoint.srcName, containerVethPrefix, netlabel.GetIfname(epOpts)); err != nil {
		return err
	}
	// Internal networks get no default gateways.
	if !network.config.Internal {
		if err := jinfo.SetGateway(network.bridge.gatewayIPv4); err != nil {
			return err
		}
		if err := jinfo.SetGatewayIPv6(network.bridge.gatewayIPv6); err != nil {
			return err
		}
	}
	// With ICC disabled, only explicitly linked endpoints may communicate.
	if !network.config.EnableICC {
		return d.link(network, endpoint, true)
	}
	return nil
}
// Leave method is invoked when a Sandbox detaches from an endpoint.
// When ICC is disabled it removes the link-based access rules that Join
// installed.
func (d *driver) Leave(nid, eid string) error {
	network, err := d.getNetwork(nid)
	if err != nil {
		return types.InternalMaskableErrorf("%v", err)
	}
	endpoint, err := network.getEndpoint(eid)
	if err != nil {
		return err
	}
	if endpoint == nil {
		return endpointNotFoundError(eid)
	}
	if network.config.EnableICC {
		return nil
	}
	return d.link(network, endpoint, false)
}
// portBindingMode records, per address family, whether an endpoint should
// have host port bindings programmed (set when the endpoint acts as its
// sandbox's gateway for that family — see ProgramExternalConnectivity).
type portBindingMode struct {
	ipv4 bool
	ipv6 bool
}
// ProgramExternalConnectivity reconciles endpoint eid's port bindings with
// its gateway role: bindings for a family are added when the endpoint is (or
// becomes) that family's gateway, and removed when it no longer is. gw4Id and
// gw6Id name the endpoints currently acting as IPv4/IPv6 gateways for the
// sandbox. Removed bindings are restored if a later step fails.
func (d *driver) ProgramExternalConnectivity(ctx context.Context, nid, eid string, gw4Id, gw6Id string) (retErr error) {
	ctx, span := otel.Tracer("").Start(ctx, spanPrefix+".ProgramExternalConnectivity", trace.WithAttributes(
		attribute.String("nid", nid),
		attribute.String("eid", eid),
		attribute.String("gw4", gw4Id),
		attribute.String("gw6", gw6Id)))
	defer span.End()
	// Make sure the network isn't deleted, or in the middle of a firewalld reload, while
	// updating its iptables rules.
	d.configNetwork.Lock()
	defer d.configNetwork.Unlock()
	network, err := d.getNetwork(nid)
	if err != nil {
		return err
	}
	endpoint, err := network.getEndpoint(eid)
	if err != nil {
		return err
	}
	if endpoint == nil {
		return endpointNotFoundError(eid)
	}
	var pbmReq portBindingMode
	// Act as the IPv4 gateway if explicitly selected.
	if gw4Id == eid {
		pbmReq.ipv4 = true
	}
	// Act as the IPv6 gateway if explicitly selected - or if there's no IPv6
	// gateway, but this endpoint is the IPv4 gateway (in which case, the userland
	// proxy may proxy between host v6 and container v4 addresses.)
	if gw6Id == eid || (gw6Id == "" && gw4Id == eid) {
		pbmReq.ipv6 = true
	}
	// If no change is needed, return.
	if endpoint.portBindingState == pbmReq {
		return nil
	}
	// Remove port bindings that aren't needed due to a change in mode.
	undoTrim, err := endpoint.trimPortBindings(ctx, network, pbmReq)
	if err != nil {
		return err
	}
	defer func() {
		// Re-add the trimmed bindings if a later step fails.
		if retErr != nil && undoTrim != nil {
			endpoint.portMapping = append(endpoint.portMapping, undoTrim()...)
		}
	}()
	// Set up new port bindings, and store them in the endpoint.
	if (pbmReq.ipv4 || pbmReq.ipv6) && endpoint.extConnConfig != nil && endpoint.extConnConfig.PortBindings != nil {
		newPMs, err := network.addPortMappings(ctx, endpoint, endpoint.extConnConfig.PortBindings, network.config.DefaultBindingIP, pbmReq)
		if err != nil {
			return err
		}
		endpoint.portMapping = append(endpoint.portMapping, newPMs...)
	}
	// Remember the new port binding state.
	endpoint.portBindingState = pbmReq
	// Clean the connection tracker state of the host for the specific endpoint. This is needed because some flows may
	// be bound to the local proxy, or to the host (for UDP packets), and won't be redirected to the new endpoints.
	clearConntrackEntries(d.nlh, endpoint)
	if err = d.storeUpdate(ctx, endpoint); err != nil {
		return fmt.Errorf("failed to update bridge endpoint %.7s to store: %v", endpoint.id, err)
	}
	return nil
}
// trimPortBindings compares pbmReq with the current port bindings, and removes
// port bindings that are no longer required.
//
// ep.portMapping is updated when bindings are removed. The returned function
// (nil when nothing was dropped) re-adds the dropped bindings and returns the
// resulting bindings, for use as a rollback by the caller.
func (ep *bridgeEndpoint) trimPortBindings(ctx context.Context, n *bridgeNetwork, pbmReq portBindingMode) (func() []portmapperapi.PortBinding, error) {
	// If the endpoint is the gateway for IPv4 and IPv6, there's nothing to drop.
	if pbmReq.ipv4 && pbmReq.ipv6 {
		return nil, nil
	}
	toDrop := make([]portmapperapi.PortBinding, 0, len(ep.portMapping))
	// Partition in place: bindings for a family the endpoint no longer
	// gateways are collected in toDrop and removed from the slice.
	toKeep := slices.DeleteFunc(ep.portMapping, func(pb portmapperapi.PortBinding) bool {
		is4 := pb.HostIP.To4() != nil
		if (is4 && !pbmReq.ipv4) || (!is4 && !pbmReq.ipv6) {
			toDrop = append(toDrop, pb)
			return true
		}
		return false
	})
	if len(toDrop) == 0 {
		return nil, nil
	}
	if err := n.unmapPBs(ctx, toDrop); err != nil {
		log.G(ctx).WithFields(log.Fields{
			"error": err,
			"gw4":   pbmReq.ipv4,
			"gw6":   pbmReq.ipv6,
			"nid":   stringid.TruncateID(n.id),
			"eid":   stringid.TruncateID(ep.id),
		}).Error("Failed to release port bindings")
		return nil, err
	}
	ep.portMapping = toKeep
	// undo re-maps the dropped bindings using the endpoint's current binding
	// state; it returns nil (and logs) if re-adding fails.
	undo := func() []portmapperapi.PortBinding {
		pbReq := make([]portmapperapi.PortBindingReq, 0, len(toDrop))
		for _, pb := range toDrop {
			pbReq = append(pbReq, portmapperapi.PortBindingReq{PortBinding: pb.GetCopy()})
		}
		pbs, err := n.addPortMappings(ctx, ep, pbReq, n.config.DefaultBindingIP, ep.portBindingState)
		if err != nil {
			log.G(ctx).WithFields(log.Fields{
				"error": err,
				"nid":   stringid.TruncateID(n.id),
				"eid":   stringid.TruncateID(ep.id),
			}).Error("Failed to restore port bindings following join failure")
			return nil
		}
		return pbs
	}
	return undo, nil
}
// clearConntrackEntries flushes conntrack entries matching endpoint IP address
// or matching one of the exposed UDP port.
// In the first case, this could happen if packets were received by the host
// between userland proxy startup and iptables setup.
// In the latter case, this could happen if packets were received whereas there
// were nowhere to route them, as netfilter creates entries in such case.
// This is required because iptables NAT rules are evaluated by netfilter only
// when creating a new conntrack entry. When Docker latter adds NAT rules,
// netfilter ignore them for any packet matching a pre-existing conntrack entry.
// As such, we need to flush all those conntrack entries to make sure NAT rules
// are correctly applied to all packets.
// See: #8795, #44688 & #44742.
func clearConntrackEntries(nlh nlwrap.Handle, ep *bridgeEndpoint) {
	var v4Addrs, v6Addrs []net.IP
	if ep.addr != nil {
		v4Addrs = []net.IP{ep.addr.IP}
	}
	if ep.addrv6 != nil {
		v6Addrs = []net.IP{ep.addrv6.IP}
	}
	// Collect the host ports of the endpoint's UDP mappings.
	var udpPorts []uint16
	for _, pm := range ep.portMapping {
		if pm.Proto != types.UDP {
			continue
		}
		udpPorts = append(udpPorts, pm.HostPort)
	}
	iptables.DeleteConntrackEntries(nlh, v4Addrs, v6Addrs)
	iptables.DeleteConntrackEntriesByPort(nlh, types.UDP, udpPorts)
}
// handleFirewalldReload restores the driver's iptables state for every known
// network after a firewalld reload has flushed it. It's a no-op unless
// iptables or ip6tables support is enabled.
func (d *driver) handleFirewalldReload() {
	if !d.config.EnableIPTables && !d.config.EnableIP6Tables {
		return
	}
	// Snapshot the network ids under the driver lock, then process each
	// network without holding it.
	d.Lock()
	ids := make([]string, 0, len(d.networks))
	for _, n := range d.networks {
		ids = append(ids, n.id)
	}
	d.Unlock()
	for _, id := range ids {
		d.handleFirewalldReloadNw(id)
	}
}
// handleFirewalldReloadNw re-creates iptables state for a single network after
// a firewalld reload: network-level packet filter rules, legacy links (default
// bridge only), per-port rules, and the bridge's membership of the docker
// firewalld zone. Errors are logged, not returned - restoration is best-effort.
func (d *driver) handleFirewalldReloadNw(nid string) {
	d.Lock()
	defer d.Unlock()
	// Re-check under the lock; config may be mutable via configure().
	if !d.config.EnableIPTables && !d.config.EnableIP6Tables {
		return
	}
	// Make sure the network isn't being deleted, and ProgramExternalConnectivity
	// isn't modifying iptables rules, while restoring the rules.
	d.configNetwork.Lock()
	defer d.configNetwork.Unlock()
	nw, ok := d.networks[nid]
	if !ok {
		// Network deleted since the reload started, not an error.
		return
	}
	if err := nw.firewallerNetwork.ReapplyNetworkLevelRules(context.TODO()); err != nil {
		log.G(context.Background()).WithFields(log.Fields{
			"nid":   nw.id,
			"error": err,
		}).Error("Failed to re-create packet filter on firewalld reload")
	}
	// Re-add legacy links - only added during ProgramExternalConnectivity, but legacy
	// links are default-bridge-only, and it's not possible to connect a container to
	// the default bridge and a user-defined network. So, the default bridge is always
	// the gateway and, if there are legacy links configured they need to be set up.
	if !nw.config.EnableICC {
		for _, ep := range nw.endpoints {
			if err := d.link(nw, ep, true); err != nil {
				log.G(context.Background()).WithFields(log.Fields{
					"nid":   nw.id,
					"eid":   ep.id,
					"error": err,
				}).Error("Failed to re-create link on firewalld reload")
			}
		}
	}
	// Set up per-port rules. These are also only set up during ProgramExternalConnectivity
	// but the network's port bindings are only configured when it's acting as the
	// gateway network. So, this is a no-op for networks that aren't providing endpoints
	// with the gateway.
	nw.reapplyPerPortIptables()
	if err := iptables.AddInterfaceFirewalld(nw.config.BridgeName); err != nil {
		log.G(context.Background()).WithFields(log.Fields{
			"error":  err,
			"nid":    nw.id,
			"bridge": nw.config.BridgeName,
		}).Error("Failed to add interface to docker zone on firewalld reload")
	}
}
// LegacyContainerLinkOptions builds the generic options map describing legacy
// container links, listing the endpoint names of this container's parents and
// children.
func LegacyContainerLinkOptions(parentEndpoints, childEndpoints []string) map[string]interface{} {
	linkData := options.Generic{
		"ParentEndpoints": parentEndpoints,
		"ChildEndpoints":  childEndpoints,
	}
	return options.Generic{netlabel.GenericData: linkData}
}
// link adds (enable=true) or removes (enable=false) the firewall rules for
// legacy container links involving this endpoint: for each parent link, rules
// allowing the parent to reach this endpoint's exposed ports; for each child
// link, rules allowing this endpoint to reach the child's exposed ports.
// It's a no-op when the endpoint has no container/connectivity config or no
// configured links.
func (d *driver) link(network *bridgeNetwork, endpoint *bridgeEndpoint, enable bool) (retErr error) {
	cc := endpoint.containerConfig
	ec := endpoint.extConnConfig
	if cc == nil || ec == nil || (len(cc.ParentEndpoints) == 0 && len(cc.ChildEndpoints) == 0) {
		// nothing to do
		return nil
	}

	// Try to keep things atomic when adding - if there's an error, recurse with enable=false
	// to delete everything that might have been created.
	if enable {
		defer func() {
			if retErr != nil {
				// Best-effort cleanup; retErr already describes the failure.
				d.link(network, endpoint, false)
			}
		}()
	}

	if ec.ExposedPorts != nil {
		for _, p := range cc.ParentEndpoints {
			parentEndpoint, err := network.getEndpoint(p)
			if err != nil {
				return err
			}
			if parentEndpoint == nil {
				return invalidEndpointIDError(p)
			}
			parentAddr, ok := netip.AddrFromSlice(parentEndpoint.addr.IP)
			if !ok {
				return fmt.Errorf("invalid parent endpoint IP: %s", parentEndpoint.addr.IP)
			}
			// In a parent link, this endpoint is the child.
			childAddr, ok := netip.AddrFromSlice(endpoint.addr.IP)
			if !ok {
				return fmt.Errorf("invalid child endpoint IP: %s", endpoint.addr.IP)
			}
			if enable {
				if err := network.firewallerNetwork.AddLink(context.TODO(), parentAddr, childAddr, ec.ExposedPorts); err != nil {
					return err
				}
			} else {
				network.firewallerNetwork.DelLink(context.TODO(), parentAddr, childAddr, ec.ExposedPorts)
			}
		}
	}

	for _, c := range cc.ChildEndpoints {
		childEndpoint, err := network.getEndpoint(c)
		if err != nil {
			return err
		}
		if childEndpoint == nil {
			return invalidEndpointIDError(c)
		}
		if childEndpoint.extConnConfig == nil || childEndpoint.extConnConfig.ExposedPorts == nil {
			continue
		}
		// In a child link, this endpoint is the parent.
		parentAddr, ok := netip.AddrFromSlice(endpoint.addr.IP)
		if !ok {
			return fmt.Errorf("invalid parent endpoint IP: %s", endpoint.addr.IP)
		}
		childAddr, ok := netip.AddrFromSlice(childEndpoint.addr.IP)
		if !ok {
			return fmt.Errorf("invalid child endpoint IP: %s", childEndpoint.addr.IP)
		}
		if enable {
			if err := network.firewallerNetwork.AddLink(context.TODO(), parentAddr, childAddr, childEndpoint.extConnConfig.ExposedPorts); err != nil {
				return err
			}
		} else {
			network.firewallerNetwork.DelLink(context.TODO(), parentAddr, childAddr, childEndpoint.extConnConfig.ExposedPorts)
		}
	}
	return nil
}
// Type returns the driver's network type name.
func (d *driver) Type() string {
	return NetworkType
}

// IsBuiltIn reports that the bridge driver is built in to the daemon, as
// opposed to a remote (plugin) driver.
func (d *driver) IsBuiltIn() bool {
	return true
}
// parseContainerOptions extracts a containerConfiguration from the generic
// options map. It returns nil (with no error) when no container configuration
// is present, or when the generic data is of an unrecognized type.
func parseContainerOptions(cOptions map[string]interface{}) (*containerConfiguration, error) {
	if cOptions == nil {
		return nil, nil
	}
	data := cOptions[netlabel.GenericData]
	if data == nil {
		return nil, nil
	}
	switch v := data.(type) {
	case *containerConfiguration:
		// Already parsed - use it directly.
		return v, nil
	case options.Generic:
		parsed, err := options.GenerateFromModel(v, &containerConfiguration{})
		if err != nil {
			return nil, err
		}
		return parsed.(*containerConfiguration), nil
	}
	return nil, nil
}
// parseConnectivityOptions extracts port mappings and exposed ports from the
// generic options map into a connectivityConfiguration. It returns an
// InvalidParameter error when either option is present but of the wrong type.
func parseConnectivityOptions(cOptions map[string]interface{}) (*connectivityConfiguration, error) {
	if cOptions == nil {
		return nil, nil
	}

	cc := &connectivityConfiguration{}

	if opt, ok := cOptions[netlabel.PortMap]; ok {
		pbs, ok := opt.([]types.PortBinding)
		if !ok {
			return nil, types.InvalidParameterErrorf("invalid port mapping data in connectivity configuration: %v", opt)
		}
		// Copy each binding so the caller's slice isn't aliased.
		cc.PortBindings = sliceutil.Map(pbs, func(pb types.PortBinding) portmapperapi.PortBindingReq {
			return portmapperapi.PortBindingReq{
				PortBinding: pb.GetCopy(),
			}
		})
	}

	if opt, ok := cOptions[netlabel.ExposedPorts]; ok {
		ports, ok := opt.([]types.TransportPort)
		if !ok {
			return nil, types.InvalidParameterErrorf("invalid exposed ports data in connectivity configuration: %v", opt)
		}
		cc.ExposedPorts = ports
	}

	return cc, nil
}
//go:build linux
package bridge
import (
"context"
"encoding/json"
"errors"
"fmt"
"net"
"strings"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/datastore"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge/internal/firewaller"
"github.com/docker/docker/daemon/libnetwork/portmapperapi"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/internal/otelutil"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/baggage"
"go.opentelemetry.io/otel/trace"
)
const (
	// Datastore key prefixes.
	// The network config prefix was not specific enough.
	// To be backward compatible, need custom endpoint
	// prefix with different root
	bridgePrefix         = "bridge"
	bridgeEndpointPrefix = "bridge-endpoint"
)
// initStore restores driver state from the persistent store: networks first,
// then their endpoints. Afterwards it releases the firewall cleaner, which is
// no longer needed.
func (d *driver) initStore() error {
	if err := d.populateNetworks(); err != nil {
		return err
	}
	if err := d.populateEndpoints(); err != nil {
		return err
	}
	// If there's a firewall cleaner, it's done its job by cleaning up rules
	// belonging to the restored networks. So, drop it.
	if fcs, ok := d.firewaller.(firewaller.FirewallCleanerSetter); ok {
		fcs.SetFirewallCleaner(nil)
	}
	return nil
}
// populateNetworks re-creates bridge networks from the persistent store.
// A failure to re-create one network is logged and doesn't prevent the
// remaining networks from being restored.
func (d *driver) populateNetworks() error {
	kvol, err := d.store.List(&networkConfiguration{})
	if err != nil && !errors.Is(err, datastore.ErrKeyNotFound) {
		return fmt.Errorf("failed to get bridge network configurations from store: %w", err)
	}

	// It's normal for network configuration state to be empty. Just return.
	if errors.Is(err, datastore.ErrKeyNotFound) {
		return nil
	}

	ctx := baggage.ContextWithBaggage(context.TODO(), otelutil.MustNewBaggage(
		otelutil.MustNewMemberRaw(otelutil.TriggerKey, spanPrefix+".initStore"),
	))
	for _, kvo := range kvol {
		ncfg := kvo.(*networkConfiguration)
		if err = d.createNetwork(ctx, ncfg); err != nil {
			log.G(context.TODO()).Warnf("could not create bridge network for id %s bridge name %s while booting up from persistent state: %v", ncfg.ID, ncfg.BridgeName, err)
			// Don't log the network as restored when it couldn't be re-created.
			continue
		}
		log.G(context.TODO()).Debugf("Network (%.7s) restored", ncfg.ID)
	}

	return nil
}
// populateEndpoints restores bridge endpoints from the persistent store,
// attaching each to its network, re-creating per-endpoint firewall rules, and
// re-establishing its port allocations. Endpoints whose network no longer
// exists are deleted from the store.
func (d *driver) populateEndpoints() error {
	kvol, err := d.store.List(&bridgeEndpoint{})
	if err != nil {
		if errors.Is(err, datastore.ErrKeyNotFound) {
			// No endpoints persisted; nothing to restore.
			return nil
		}
		return fmt.Errorf("failed to get bridge endpoints from store: %w", err)
	}

	for _, obj := range kvol {
		ep := obj.(*bridgeEndpoint)
		nw, found := d.networks[ep.nid]
		if !found {
			log.G(context.TODO()).Debugf("Network (%.7s) not found for restored bridge endpoint (%.7s)", ep.nid, ep.id)
			log.G(context.TODO()).Debugf("Deleting stale bridge endpoint (%.7s) from store", ep.id)
			if err := d.storeDelete(ep); err != nil {
				log.G(context.TODO()).Debugf("Failed to delete stale bridge endpoint (%.7s) from store", ep.id)
			}
			continue
		}
		nw.endpoints[ep.id] = ep
		ip4, ip6 := ep.netipAddrs()
		if err := nw.firewallerNetwork.AddEndpoint(context.TODO(), ip4, ip6); err != nil {
			log.G(context.TODO()).WithFields(log.Fields{
				"error": err,
				"ep.id": ep.id,
			}).Warn("Failed to restore per-endpoint firewall rules")
		}
		nw.restorePortAllocations(ep)
		log.G(context.TODO()).Debugf("Endpoint (%.7s) restored to network (%.7s)", ep.id, ep.nid)
	}

	return nil
}
// storeUpdate atomically writes kvObject to the driver's datastore, tracing
// the operation. It's a logged no-op when no store is configured.
func (d *driver) storeUpdate(ctx context.Context, kvObject datastore.KVObject) error {
	ctx, span := otel.Tracer("").Start(ctx, spanPrefix+".storeUpdate", trace.WithAttributes(
		attribute.String("kvObject", fmt.Sprintf("%+v", kvObject.Key()))))
	defer span.End()

	if d.store == nil {
		log.G(ctx).Warnf("bridge store not initialized. kv object %s is not added to the store", datastore.Key(kvObject.Key()...))
		return nil
	}

	err := d.store.PutObjectAtomic(kvObject)
	if err != nil {
		return fmt.Errorf("failed to update bridge store for object type %T: %v", kvObject, err)
	}
	return nil
}
// storeDelete removes kvObject from the driver's datastore. It's a logged
// no-op when no store is configured.
func (d *driver) storeDelete(kvObject datastore.KVObject) error {
	if d.store == nil {
		log.G(context.TODO()).Debugf("bridge store not initialized. kv object %s is not deleted from store", datastore.Key(kvObject.Key()...))
		return nil
	}
	return d.store.DeleteObject(kvObject)
}
// MarshalJSON encodes the network configuration into the flat key/value form
// used for on-disk persistence.
func (ncfg *networkConfiguration) MarshalJSON() ([]byte, error) {
	fields := map[string]interface{}{
		"ID":                    ncfg.ID,
		"BridgeName":            ncfg.BridgeName,
		"EnableIPv4":            ncfg.EnableIPv4,
		"EnableIPv6":            ncfg.EnableIPv6,
		"EnableIPMasquerade":    ncfg.EnableIPMasquerade,
		"GwModeIPv4":            ncfg.GwModeIPv4,
		"GwModeIPv6":            ncfg.GwModeIPv6,
		"EnableICC":             ncfg.EnableICC,
		"TrustedHostInterfaces": strings.Join(ncfg.TrustedHostInterfaces, ":"),
		"InhibitIPv4":           ncfg.InhibitIPv4,
		"Mtu":                   ncfg.Mtu,
		"Internal":              ncfg.Internal,
		"DefaultBridge":         ncfg.DefaultBridge,
		"DefaultBindingIP":      ncfg.DefaultBindingIP.String(),
		// This key is "HostIP" instead of "HostIPv4" to preserve compatibility with the on-disk format.
		"HostIP":               ncfg.HostIPv4.String(),
		"HostIPv6":             ncfg.HostIPv6.String(),
		"DefaultGatewayIPv4":   ncfg.DefaultGatewayIPv4.String(),
		"DefaultGatewayIPv6":   ncfg.DefaultGatewayIPv6.String(),
		"ContainerIfacePrefix": ncfg.ContainerIfacePrefix,
		"BridgeIfaceCreator":   ncfg.BridgeIfaceCreator,
	}
	// The assigned subnets are optional - only stored when present.
	if ncfg.AddressIPv4 != nil {
		fields["AddressIPv4"] = ncfg.AddressIPv4.String()
	}
	if ncfg.AddressIPv6 != nil {
		fields["AddressIPv6"] = ncfg.AddressIPv6.String()
	}
	return json.Marshal(fields)
}
// UnmarshalJSON restores a networkConfiguration from its persisted key/value
// form. Keys added after the configuration format was first persisted
// (EnableIPv4, GwMode*, TrustedHostInterfaces, InhibitIPv4, Internal,
// BridgeIfaceCreator) are optional for backward compatibility; the remaining
// keys are assumed present (a missing key would make the type assertion panic).
func (ncfg *networkConfiguration) UnmarshalJSON(b []byte) error {
	var (
		err  error
		nMap map[string]interface{}
	)

	if err = json.Unmarshal(b, &nMap); err != nil {
		return err
	}

	if v, ok := nMap["AddressIPv4"]; ok {
		if ncfg.AddressIPv4, err = types.ParseCIDR(v.(string)); err != nil {
			return types.InternalErrorf("failed to decode bridge network address IPv4 after json unmarshal: %s", v.(string))
		}
		// For networks created before EnableIPv4 was added ...
		ncfg.EnableIPv4 = true
	}

	if v, ok := nMap["AddressIPv6"]; ok {
		if ncfg.AddressIPv6, err = types.ParseCIDR(v.(string)); err != nil {
			return types.InternalErrorf("failed to decode bridge network address IPv6 after json unmarshal: %s", v.(string))
		}
	}

	if v, ok := nMap["ContainerIfacePrefix"]; ok {
		ncfg.ContainerIfacePrefix = v.(string)
	}

	// This key is "HostIP" instead of "HostIPv4" to preserve compatibility with the on-disk format.
	if v, ok := nMap["HostIP"]; ok {
		ncfg.HostIPv4 = net.ParseIP(v.(string))
	}
	if v, ok := nMap["HostIPv6"]; ok {
		ncfg.HostIPv6 = net.ParseIP(v.(string))
	}

	ncfg.DefaultBridge = nMap["DefaultBridge"].(bool)
	ncfg.DefaultBindingIP = net.ParseIP(nMap["DefaultBindingIP"].(string))
	ncfg.DefaultGatewayIPv4 = net.ParseIP(nMap["DefaultGatewayIPv4"].(string))
	ncfg.DefaultGatewayIPv6 = net.ParseIP(nMap["DefaultGatewayIPv6"].(string))
	ncfg.ID = nMap["ID"].(string)
	ncfg.BridgeName = nMap["BridgeName"].(string)
	if v, ok := nMap["EnableIPv4"]; ok {
		// Overrides the default set above when AddressIPv4 was present.
		ncfg.EnableIPv4 = v.(bool)
	}
	ncfg.EnableIPv6 = nMap["EnableIPv6"].(bool)
	ncfg.EnableIPMasquerade = nMap["EnableIPMasquerade"].(bool)
	if v, ok := nMap["GwModeIPv4"]; ok {
		// Parse errors are deliberately ignored; an unrecognized mode is left
		// at the zero value.
		ncfg.GwModeIPv4, _ = newGwMode(v.(string))
	}
	if v, ok := nMap["GwModeIPv6"]; ok {
		ncfg.GwModeIPv6, _ = newGwMode(v.(string))
	}
	ncfg.EnableICC = nMap["EnableICC"].(bool)
	if v, ok := nMap["TrustedHostInterfaces"]; ok {
		// Stored as a single ":"-separated string (see MarshalJSON).
		s, _ := v.(string)
		ncfg.TrustedHostInterfaces = strings.FieldsFunc(s, func(r rune) bool { return r == ':' })
	}
	if v, ok := nMap["InhibitIPv4"]; ok {
		ncfg.InhibitIPv4 = v.(bool)
	}
	// JSON numbers decode into interface{} as float64.
	ncfg.Mtu = int(nMap["Mtu"].(float64))
	if v, ok := nMap["Internal"]; ok {
		ncfg.Internal = v.(bool)
	}
	if v, ok := nMap["BridgeIfaceCreator"]; ok {
		ncfg.BridgeIfaceCreator = ifaceCreator(v.(float64))
	}

	return nil
}
// Key returns the datastore key path for this network configuration.
func (ncfg *networkConfiguration) Key() []string {
	return []string{bridgePrefix, ncfg.ID}
}

// KeyPrefix returns the key prefix under which all bridge network
// configurations are stored.
func (ncfg *networkConfiguration) KeyPrefix() []string {
	return []string{bridgePrefix}
}

// Value serializes the configuration to JSON, returning nil on marshal failure.
func (ncfg *networkConfiguration) Value() []byte {
	b, err := json.Marshal(ncfg)
	if err != nil {
		return nil
	}
	return b
}

// SetValue restores the configuration from its JSON form.
func (ncfg *networkConfiguration) SetValue(value []byte) error {
	return json.Unmarshal(value, ncfg)
}

// Index returns the datastore index of this object.
func (ncfg *networkConfiguration) Index() uint64 {
	return ncfg.dbIndex
}

// SetIndex records the datastore index and marks the object as persisted.
func (ncfg *networkConfiguration) SetIndex(index uint64) {
	ncfg.dbIndex = index
	ncfg.dbExists = true
}

// Exists reports whether the object has been persisted to the datastore.
func (ncfg *networkConfiguration) Exists() bool {
	return ncfg.dbExists
}

// Skip reports whether persistence should be skipped; network configurations
// are always stored.
func (ncfg *networkConfiguration) Skip() bool {
	return false
}

// New returns an empty networkConfiguration, used by the datastore when
// listing objects of this type.
func (ncfg *networkConfiguration) New() datastore.KVObject {
	return &networkConfiguration{}
}

// CopyTo shallow-copies this configuration into o.
func (ncfg *networkConfiguration) CopyTo(o datastore.KVObject) error {
	dstNcfg := o.(*networkConfiguration)
	*dstNcfg = *ncfg
	return nil
}
// MarshalJSON encodes the endpoint's persistent state for the datastore.
func (ep *bridgeEndpoint) MarshalJSON() ([]byte, error) {
	state := map[string]interface{}{
		"id":                 ep.id,
		"nid":                ep.nid,
		"SrcName":            ep.srcName,
		"MacAddress":         ep.macAddress.String(),
		"ContainerConfig":    ep.containerConfig,
		"ExternalConnConfig": ep.extConnConfig,
		"PortMapping":        ep.portMapping,
	}
	// Addresses are optional - only stored when assigned.
	if ep.addr != nil {
		state["Addr"] = ep.addr.String()
	}
	if ep.addrv6 != nil {
		state["Addrv6"] = ep.addrv6.String()
	}
	return json.Marshal(state)
}
// UnmarshalJSON restores a bridgeEndpoint from its persisted form. The nested
// config structures are recovered by re-marshalling the decoded interface{}
// values and unmarshalling them into the typed fields; failures there are
// logged rather than returned.
func (ep *bridgeEndpoint) UnmarshalJSON(b []byte) error {
	var (
		err   error
		epMap map[string]interface{}
	)

	if err = json.Unmarshal(b, &epMap); err != nil {
		return fmt.Errorf("Failed to unmarshal to bridge endpoint: %v", err)
	}

	if v, ok := epMap["MacAddress"]; ok {
		if ep.macAddress, err = net.ParseMAC(v.(string)); err != nil {
			return types.InternalErrorf("failed to decode bridge endpoint MAC address (%s) after json unmarshal: %v", v.(string), err)
		}
	}
	if v, ok := epMap["Addr"]; ok {
		if ep.addr, err = types.ParseCIDR(v.(string)); err != nil {
			return types.InternalErrorf("failed to decode bridge endpoint IPv4 address (%s) after json unmarshal: %v", v.(string), err)
		}
	}
	if v, ok := epMap["Addrv6"]; ok {
		if ep.addrv6, err = types.ParseCIDR(v.(string)); err != nil {
			return types.InternalErrorf("failed to decode bridge endpoint IPv6 address (%s) after json unmarshal: %v", v.(string), err)
		}
	}
	// These keys are assumed present; a missing key would panic here.
	ep.id = epMap["id"].(string)
	ep.nid = epMap["nid"].(string)
	ep.srcName = epMap["SrcName"].(string)

	// TODO(thaJeztah): these nullify ep.containerConfig, ep.extConnConfig, and ep.portMapping
	// if "ContainerConfig", "ExternalConnConfig", or "PortMapping" are not present or invalid
	// is this the intent? Or should this leave values untouched if present but invalid?
	//
	// Alternatively, it could be checking if the value is set (and not nil), otherwise
	// explicitly nullify like (or variants of);
	//
	// 	if c, ok := epMap["ContainerConfig"]; ok && c != nil {
	// 		if d, err := json.Marshal(c); err != nil {
	// 			log.G(context.TODO()).Warnf("Failed to encode endpoint container config %v", err)
	// 		} else if err := json.Unmarshal(d, &ep.containerConfig); err != nil {
	// 			log.G(context.TODO()).Warnf("Failed to decode endpoint container config %v", err)
	// 		}
	// 	} else {
	// 		ep.containerConfig = nil
	// 	}
	d, err := json.Marshal(epMap["ContainerConfig"])
	if err != nil {
		log.G(context.TODO()).Warnf("Failed to encode endpoint container config %v", err)
	}
	if err := json.Unmarshal(d, &ep.containerConfig); err != nil {
		log.G(context.TODO()).Warnf("Failed to decode endpoint container config %v", err)
	}

	d, err = json.Marshal(epMap["ExternalConnConfig"])
	if err != nil {
		log.G(context.TODO()).Warnf("Failed to encode endpoint external connectivity configuration %v", err)
	}
	if err := json.Unmarshal(d, &ep.extConnConfig); err != nil {
		log.G(context.TODO()).Warnf("Failed to decode endpoint external connectivity configuration %v", err)
	}

	d, err = json.Marshal(epMap["PortMapping"])
	if err != nil {
		log.G(context.TODO()).Warnf("Failed to encode endpoint port mapping %v", err)
	}
	if err := json.Unmarshal(d, &ep.portMapping); err != nil {
		log.G(context.TODO()).Warnf("Failed to decode endpoint port mapping %v", err)
	}

	// Until release 27.0, HostPortEnd in PortMapping (operational data) was left at
	// the value it had in ExternalConnConfig.PortBindings (configuration). So, for
	// example, if the configured host port range was 8000-8009 and the allocated
	// port was 8004, the stored range was 8004-8009. Also, if allocation for an
	// explicit (non-ephemeral) range failed because some other process had a port
	// bound, there was no attempt to retry (because HostPort!=0). Now that's fixed,
	// on live-restore we don't want to allocate different ports - so, remove the range
	// from the operational data.
	// TODO(robmry) - remove once direct upgrade from moby 26.x is no longer supported.
	for i := range ep.portMapping {
		ep.portMapping[i].HostPortEnd = ep.portMapping[i].HostPort
	}

	return nil
}
// Key returns the datastore key path for this endpoint.
func (ep *bridgeEndpoint) Key() []string {
	return []string{bridgeEndpointPrefix, ep.id}
}

// KeyPrefix returns the key prefix under which all bridge endpoints are stored.
func (ep *bridgeEndpoint) KeyPrefix() []string {
	return []string{bridgeEndpointPrefix}
}

// Value serializes the endpoint to JSON, returning nil on marshal failure.
func (ep *bridgeEndpoint) Value() []byte {
	b, err := json.Marshal(ep)
	if err != nil {
		return nil
	}
	return b
}

// SetValue restores the endpoint from its JSON form.
func (ep *bridgeEndpoint) SetValue(value []byte) error {
	return json.Unmarshal(value, ep)
}

// Index returns the datastore index of this object.
func (ep *bridgeEndpoint) Index() uint64 {
	return ep.dbIndex
}

// SetIndex records the datastore index and marks the object as persisted.
func (ep *bridgeEndpoint) SetIndex(index uint64) {
	ep.dbIndex = index
	ep.dbExists = true
}

// Exists reports whether the object has been persisted to the datastore.
func (ep *bridgeEndpoint) Exists() bool {
	return ep.dbExists
}

// Skip reports whether persistence should be skipped; endpoints are always stored.
func (ep *bridgeEndpoint) Skip() bool {
	return false
}

// New returns an empty bridgeEndpoint, used by the datastore when listing
// objects of this type.
func (ep *bridgeEndpoint) New() datastore.KVObject {
	return &bridgeEndpoint{}
}

// CopyTo shallow-copies this endpoint into o.
func (ep *bridgeEndpoint) CopyTo(o datastore.KVObject) error {
	dstEp := o.(*bridgeEndpoint)
	*dstEp = *ep
	return nil
}
// restorePortAllocations is used during live-restore. It re-creates iptables
// forwarding/NAT rules, and restarts docker-proxy, as needed. On success,
// ep.portMapping and ep.portBindingState are updated to reflect the restored
// bindings.
//
// TODO(robmry) - if any previously-mapped host ports are no longer available, all
// iptables forwarding/NAT rules get removed and there will be no docker-proxy
// processes. So, the container will be left running, but inaccessible.
func (n *bridgeNetwork) restorePortAllocations(ep *bridgeEndpoint) {
	// Nothing to restore if the endpoint had no external connectivity configured.
	if ep.extConnConfig == nil ||
		ep.extConnConfig.ExposedPorts == nil ||
		ep.extConnConfig.PortBindings == nil {
		return
	}
	// ep.portMapping has HostPort=HostPortEnd, the host port allocated last
	// time around ... use that in place of ep.extConnConfig.PortBindings, which
	// may specify host port ranges.
	cfg := make([]portmapperapi.PortBindingReq, len(ep.portMapping))
	for i, b := range ep.portMapping {
		cfg[i] = portmapperapi.PortBindingReq{PortBinding: b.PortBinding}
	}
	// Calculate a portBindingMode - it need not be accurate but, if there were
	// IPv4/IPv6 bindings before, ensure they are re-created. (If, for example,
	// there are no IPv6 bindings, it doesn't matter whether that was because this
	// endpoint is not an IPv6 gateway and "pbmIPv6" was not set in the port
	// binding state, or there were just no IPv6 port bindings configured.)
	var pbm portBindingMode
	for _, b := range ep.portMapping {
		if b.HostIP.To4() == nil {
			pbm.ipv6 = true
		} else {
			pbm.ipv4 = true
		}
	}
	var err error
	ep.portMapping, err = n.addPortMappings(context.TODO(), ep, cfg, n.config.DefaultBindingIP, pbm)
	if err != nil {
		// Best-effort: log and continue, see the TODO above.
		log.G(context.TODO()).Warnf("Failed to reserve existing port mapping for endpoint %.7s:%v", ep.id, err)
	}
	ep.portBindingState = pbm
}
//go:build linux
package bridge
import (
"errors"
"fmt"
"github.com/docker/docker/errdefs"
)
// errInvalidGateway is returned when the user provided default gateway (v4/v6) is not valid:
// the gateway address must be part of the network's subnet.
var errInvalidGateway = errdefs.InvalidParameter(errors.New("default gateway ip must be part of the network"))
// invalidNetworkIDError is returned when the passed
// network id for an existing network is not a known id.
type invalidNetworkIDError string

// Error formats the unknown network id into a message.
func (e invalidNetworkIDError) Error() string {
	return fmt.Sprint("invalid network id ", string(e))
}

// NotFound denotes the type of this error
func (e invalidNetworkIDError) NotFound() {}
// invalidEndpointIDError is returned when the passed
// endpoint id is not valid.
type invalidEndpointIDError string

// Error formats the invalid endpoint id into a message.
func (e invalidEndpointIDError) Error() string {
	return fmt.Sprint("invalid endpoint id: ", string(e))
}

// InvalidParameter denotes the type of this error
func (e invalidEndpointIDError) InvalidParameter() {}
// endpointNotFoundError is returned when the no endpoint
// with the passed endpoint id is found.
type endpointNotFoundError string

// Error formats the missing endpoint id into a message.
func (e endpointNotFoundError) Error() string {
	return fmt.Sprint("endpoint not found: ", string(e))
}

// NotFound denotes the type of this error
func (e endpointNotFoundError) NotFound() {}
package bridge
import (
"context"
"fmt"
"net"
"net/netip"
"syscall"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/internal/netiputil"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/internal/nlwrap"
"github.com/vishvananda/netlink"
)
const (
	// DefaultBridgeName is the default name for the bridge interface managed
	// by the driver when unspecified by the caller (see newInterface).
	DefaultBridgeName = "docker0"
)
// bridgeInterface models the bridge network device.
type bridgeInterface struct {
	Link        netlink.Link  // the bridge device itself; nil when it doesn't exist (see exists())
	bridgeIPv4  *net.IPNet    // bridge IPv4 address/subnet
	bridgeIPv6  *net.IPNet    // bridge IPv6 address/subnet (set from config in programIPv6Addresses)
	gatewayIPv4 net.IP        // IPv4 gateway address
	gatewayIPv6 net.IP        // IPv6 gateway address; set to the bridge's own IPv6 address
	nlh         nlwrap.Handle // netlink handle used for bridge queries and address updates
}
// newInterface creates a new bridge interface structure. It attempts to find
// an already existing device identified by the configuration BridgeName field,
// or the default bridge name when unspecified, but doesn't attempt to create
// one when missing
func newInterface(nlh nlwrap.Handle, config *networkConfiguration) (*bridgeInterface, error) {
var err error
i := &bridgeInterface{nlh: nlh}
// Initialize the bridge name to the default if unspecified.
if config.BridgeName == "" {
config.BridgeName = DefaultBridgeName
}
// Attempt to find an existing bridge named with the specified name.
i.Link, err = nlh.LinkByName(config.BridgeName)
if err != nil {
log.G(context.TODO()).Debugf("Did not find any interface with name %s: %v", config.BridgeName, err)
} else if _, ok := i.Link.(*netlink.Bridge); !ok {
return nil, fmt.Errorf("existing interface %s is not a bridge", i.Link.Attrs().Name)
}
return i, nil
}
// exists indicates if the existing bridge interface exists on the system
// (i.e. newInterface found a device with the configured name).
func (i *bridgeInterface) exists() bool {
	return i.Link != nil
}
// addresses returns a bridge's addresses, IPv4 (with family=netlink.FAMILY_V4)
// or IPv6 (family=netlink.FAMILY_V6).
func (i *bridgeInterface) addresses(family int) ([]netlink.Addr, error) {
	if !i.exists() {
		// A nonexistent interface, by definition, cannot have any addresses.
		return nil, nil
	}
	list, err := i.nlh.AddrList(i.Link, family)
	if err == nil {
		return list, nil
	}
	return nil, fmt.Errorf("Failed to retrieve addresses: %v", err)
}
// programIPv6Addresses makes the bridge's assigned IPv6 addresses match
// config.AddressIPv6: it removes residual addresses that are no longer wanted
// (preserving kernel link-local and multicast addresses), then assigns the
// configured address. Also records the configured address/gateway on i.
func (i *bridgeInterface) programIPv6Addresses(config *networkConfiguration) error {
	// Remember the configured addresses.
	i.bridgeIPv6 = config.AddressIPv6
	i.gatewayIPv6 = config.AddressIPv6.IP

	addrPrefix, ok := netiputil.ToPrefix(config.AddressIPv6)
	if !ok {
		return errdefs.System(
			fmt.Errorf("failed to convert bridge IPv6 address '%s' to netip.Prefix",
				config.AddressIPv6.String()))
	}

	// Get the IPv6 addresses currently assigned to the bridge, if any.
	existingAddrs, err := i.addresses(netlink.FAMILY_V6)
	if err != nil {
		return errdefs.System(err)
	}

	// Remove addresses that aren't required.
	for _, existingAddr := range existingAddrs {
		ea, ok := netip.AddrFromSlice(existingAddr.IP)
		if !ok {
			return errdefs.System(
				fmt.Errorf("Failed to convert IPv6 address '%s' to netip.Addr", config.AddressIPv6))
		}
		// Don't delete the kernel-assigned link local address (or fe80::1 - if it was
		// assigned to the bridge by an older version of the daemon that deleted the
		// kernel_ll address, the bridge won't get a new kernel_ll address.) But, do
		// delete unexpected link-local addresses (fe80::/10) that aren't in fe80::/64,
		// those have been IPAM-assigned.
		if p, _ := ea.Prefix(64); p == linkLocalPrefix {
			continue
		}
		// Don't delete multicast addresses as they're never added by the daemon.
		if ea.IsMulticast() {
			continue
		}
		// Ignore the prefix length when comparing addresses, it's informational
		// (RFC-5942 section 4), and removing/re-adding an address that's still valid
		// would disrupt traffic on live-restore.
		if ea != addrPrefix.Addr() || config.GwModeIPv6.isolated() {
			// Deletion failure is logged, not fatal - the configured address is
			// still (re)assigned below.
			err := i.nlh.AddrDel(i.Link, &existingAddr) //#nosec G601 -- Memory aliasing is not an issue in practice as the &existingAddr pointer is not retained by the callee after the AddrDel() call returns.
			if err != nil {
				log.G(context.TODO()).WithFields(log.Fields{
					"error":   err,
					"address": existingAddr.IPNet,
				},
				).Warnf("Failed to remove residual IPv6 address from bridge")
			}
		}
	}

	// Using AddrReplace(), rather than AddrAdd(). When the subnet is changed for an
	// existing bridge in a way that doesn't affect the bridge's assigned address,
	// the old address has not been removed at this point - because that would be
	// service-affecting for a running container.
	//
	// But if, for example, 'fixed-cidr-v6' is changed from '2000:dbe::/64' to
	// '2000:dbe::/80', the default bridge will still be assigned address
	// '2000:dbe::1'. In the output of 'ip a', the prefix length is displayed - and
	// the user is likely to expect to see it updated from '64' to '80'.
	// Unfortunately, 'netlink.AddrReplace()' ('RTM_NEWADDR' with 'NLM_F_REPLACE')
	// doesn't update the prefix length. This is a cosmetic problem, the prefix
	// length of an assigned address is not used to determine whether an address is
	// "on-link" (RFC-5942).
	if err := i.nlh.AddrReplace(i.Link, &netlink.Addr{
		IPNet: netiputil.ToIPNet(addrPrefix),
		Flags: syscall.IFA_F_NODAD,
	}); err != nil {
		return errdefs.System(fmt.Errorf("failed to add IPv6 address %s to bridge: %v", i.bridgeIPv6, err))
	}
	return nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23 && linux
package firewaller
import (
"context"
"fmt"
"net/netip"
"slices"
"github.com/docker/docker/daemon/libnetwork/types"
)
// StubFirewaller implements a Firewaller for unit tests. It just tracks what it's been asked for.
type StubFirewaller struct {
	Config
	Networks map[string]*StubFirewallerNetwork // networks created via NewNetwork, keyed by IfName
	FFD      map[IPVersion]bool                // filter forward drop, per IP version
}
// NewStubFirewaller returns a StubFirewaller with empty tracking state.
func NewStubFirewaller(config Config) *StubFirewaller {
	fw := &StubFirewaller{Config: config}
	// A real Firewaller shouldn't hold on to its own networks, the bridge driver is doing that.
	// But, for unit tests cross-checking the driver, this is useful.
	fw.Networks = map[string]*StubFirewallerNetwork{}
	fw.FFD = map[IPVersion]bool{}
	return fw
}
// NewNetwork records and returns a new stub network; it's an error to create
// two networks with the same interface name.
func (fw *StubFirewaller) NewNetwork(_ context.Context, nc NetworkConfig) (Network, error) {
	if _, exists := fw.Networks[nc.IfName]; exists {
		return nil, fmt.Errorf("StubFirewaller: network with IfName %q already exists", nc.IfName)
	}
	nw := &StubFirewallerNetwork{
		NetworkConfig: nc,
		Endpoints:     make(map[stubEndpoint]struct{}),
		parent:        fw,
	}
	fw.Networks[nc.IfName] = nw
	return nw, nil
}
// FilterForwardDrop records that the forward-drop policy was requested for ipv.
func (fw *StubFirewaller) FilterForwardDrop(_ context.Context, ipv IPVersion) error {
	fw.FFD[ipv] = true
	return nil
}
// stubFirewallerLink records a legacy-link rule between two endpoint addresses.
type stubFirewallerLink struct {
	parentIP netip.Addr
	childIP  netip.Addr
	ports    []types.TransportPort
}

// stubEndpoint identifies an endpoint by its IPv4/IPv6 address pair.
type stubEndpoint struct {
	addr4 netip.Addr
	addr6 netip.Addr
}

// StubFirewallerNetwork tracks per-network firewaller state for unit tests.
type StubFirewallerNetwork struct {
	NetworkConfig
	Deleted   bool                      // unused by the methods visible here; presumably set elsewhere on deletion - TODO confirm
	Endpoints map[stubEndpoint]struct{} // endpoints added via AddEndpoint
	Ports     []types.PortBinding       // port bindings added via AddPorts
	Links     []stubFirewallerLink      // links added via AddLink
	parent    *StubFirewaller           // owning stub, used to cross-check network deletion
}
// ReapplyNetworkLevelRules is a no-op for the stub.
func (nw *StubFirewallerNetwork) ReapplyNetworkLevelRules(_ context.Context) error {
	return nil
}
// DelNetworkLevelRules removes the network from the parent stub, after
// cross-checking that the driver has already cleaned up all per-endpoint,
// per-port and per-link state.
func (nw *StubFirewallerNetwork) DelNetworkLevelRules(_ context.Context) error {
	if _, found := nw.parent.Networks[nw.IfName]; !found {
		return fmt.Errorf("StubFirewaller: DelNetworkLevelRules: network '%s' does not exist", nw.IfName)
	}
	// A real firewaller may not report an error if network rules are deleted without
	// per-endpoint/port/link rules being deleted first, the bridge driver is responsible
	// for tracking all that - and it may not be an error if, for example, the driver
	// knows the rules have already been deleted by a firewalld reload. So, this may be
	// wrong for some tests but, for now, cross-check the deletion.
	switch {
	case len(nw.Endpoints) != 0:
		return fmt.Errorf("StubFirewaller: DelNetworkLevelRules: network '%s' still has endpoints", nw.IfName)
	case len(nw.Ports) != 0:
		return fmt.Errorf("StubFirewaller: DelNetworkLevelRules: network '%s' still has ports", nw.IfName)
	case len(nw.Links) != 0:
		return fmt.Errorf("StubFirewaller: DelNetworkLevelRules: network '%s' still has links", nw.IfName)
	}
	delete(nw.parent.Networks, nw.IfName)
	return nil
}
// AddEndpoint records an endpoint's address pair; adding the same pair twice
// is an error.
func (nw *StubFirewallerNetwork) AddEndpoint(_ context.Context, epIPv4, epIPv6 netip.Addr) error {
	key := stubEndpoint{addr4: epIPv4, addr6: epIPv6}
	if _, dup := nw.Endpoints[key]; dup {
		return fmt.Errorf("StubFirewaller: AddEndpoint: %s/%s already exists", epIPv4, epIPv6)
	}
	nw.Endpoints[key] = struct{}{}
	return nil
}
// DelEndpoint removes a previously-added endpoint; it is an error if the
// address pair was never added (or already deleted).
func (nw *StubFirewallerNetwork) DelEndpoint(_ context.Context, epIPv4, epIPv6 netip.Addr) error {
	ep := stubEndpoint{addr4: epIPv4, addr6: epIPv6}
	if _, ok := nw.Endpoints[ep]; !ok {
		return fmt.Errorf("StubFirewaller: DelEndpoint: %s/%s does not exist", epIPv4, epIPv6)
	}
	delete(nw.Endpoints, ep)
	return nil
}
// AddPorts records the given port bindings, storing a copy of each. Bindings
// that are already present are skipped, making the call idempotent.
//
// Fix: the previous implementation returned early (`return nil`) on the first
// already-present binding, silently dropping any remaining bindings in pbs.
// It now continues with the rest of the slice.
func (nw *StubFirewallerNetwork) AddPorts(_ context.Context, pbs []types.PortBinding) error {
	for _, pb := range pbs {
		if nw.PortExists(pb) {
			// Already recorded; move on to the next binding instead of
			// abandoning the remainder of the batch.
			continue
		}
		nw.Ports = append(nw.Ports, pb.GetCopy())
	}
	return nil
}
// DelPorts removes every recorded binding equal to one of the given bindings.
// Bindings that are not present are ignored.
func (nw *StubFirewallerNetwork) DelPorts(_ context.Context, pbs []types.PortBinding) error {
	remove := func(pb types.PortBinding) {
		nw.Ports = slices.DeleteFunc(nw.Ports, func(p types.PortBinding) bool {
			return p.Equal(&pb)
		})
	}
	for i := range pbs {
		remove(pbs[i])
	}
	return nil
}
// AddLink records a legacy link, storing copies of the ports. Adding an
// identical link twice is a no-op.
func (nw *StubFirewallerNetwork) AddLink(_ context.Context, parentIP, childIP netip.Addr, ports []types.TransportPort) error {
	if nw.LinkExists(parentIP, childIP, ports) {
		return nil
	}
	portCopies := make([]types.TransportPort, 0, len(ports))
	for _, p := range ports {
		portCopies = append(portCopies, p.GetCopy())
	}
	nw.Links = append(nw.Links, stubFirewallerLink{
		parentIP: parentIP,
		childIP:  childIP,
		ports:    portCopies,
	})
	return nil
}
// DelLink removes any recorded link matching the given addresses and ports.
func (nw *StubFirewallerNetwork) DelLink(_ context.Context, parentIP, childIP netip.Addr, ports []types.TransportPort) {
	nw.Links = slices.DeleteFunc(nw.Links, func(l stubFirewallerLink) bool {
		return matchLink(l, parentIP, childIP, ports)
	})
}
// PortExists reports whether an equal port binding has been recorded.
func (nw *StubFirewallerNetwork) PortExists(pb types.PortBinding) bool {
	return slices.ContainsFunc(nw.Ports, func(p types.PortBinding) bool {
		return p.Equal(&pb)
	})
}
// LinkExists reports whether a link with the given addresses and ports has
// been recorded.
func (nw *StubFirewallerNetwork) LinkExists(parentIP, childIP netip.Addr, ports []types.TransportPort) bool {
	return slices.ContainsFunc(nw.Links, func(l stubFirewallerLink) bool {
		return matchLink(l, parentIP, childIP, ports)
	})
}
// matchLink reports whether l has exactly the given parent/child addresses
// and the same ports, compared element-wise in order.
func matchLink(l stubFirewallerLink, parentIP, childIP netip.Addr, ports []types.TransportPort) bool {
	if l.parentIP != parentIP || l.childIP != childIP {
		return false
	}
	return slices.EqualFunc(l.ports, ports, func(a, b types.TransportPort) bool {
		return a.Equal(&b)
	})
}
//go:build linux
package iptabler
import (
"context"
"net/netip"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge/internal/firewaller"
"github.com/docker/docker/daemon/libnetwork/iptables"
"github.com/docker/docker/daemon/libnetwork/types"
)
// iptablesCleaner is the firewaller.FirewallCleaner returned by NewCleaner;
// it deletes network/port-specific legacy rules as networks are replayed.
type iptablesCleaner struct {
	config firewaller.Config
}
// NewCleaner checks for iptables rules left behind by an old daemon that was using
// the iptabler.
//
// If there are old rules present, it deletes as much as possible straight away
// (user-defined chains and jumps from the built-in chains).
//
// But, it can't delete network or port-specific rules from built-in chains
// without flushing those chains which would be very antisocial - because, at
// this stage, interface names, addresses, etc. are unknown. So, also return
// FirewallCleaner that the new Firewaller can use to delete those rules while
// it's setting up those networks/ports (probably on replay from persistent
// storage).
//
// If there are no old rules to clean up, return nil.
func NewCleaner(ctx context.Context, config firewaller.Config) firewaller.FirewallCleaner {
	// clean removes top-level legacy rules for one IP family and reports
	// whether anything looked like it needed cleaning.
	clean := func(ipv iptables.IPVersion, enabled bool) bool {
		if !enabled {
			return false
		}
		t := iptables.GetIptable(ipv)
		// Since 28.0, the jump in the filter-FORWARD chain is DOCKER-FORWARD.
		// In earlier releases, there was a jump to DOCKER-ISOLATION-STAGE-1.
		if !t.Exists("filter", "FORWARD", "-j", DockerForwardChain) &&
			!t.Exists("filter", "FORWARD", "-j", isolationChain1) {
			// No trace of an old daemon's rules for this family.
			return false
		}
		log.G(ctx).WithField("ipv", ipv).Info("Cleaning iptables")
		// Best-effort deletions; the individual rules may not all exist.
		_ = t.DeleteJumpRule("FORWARD", DockerForwardChain)
		_ = deleteLegacyTopLevelRules(ctx, t, ipv)
		removeIPChains(ctx, ipv)
		return true
	}
	// Deliberately not short-circuited: both families must be cleaned.
	cleaned4 := clean(iptables.IPv4, config.IPv4)
	cleaned6 := clean(iptables.IPv6, config.IPv6)
	if !cleaned4 && !cleaned6 {
		return nil
	}
	return &iptablesCleaner{config: config}
}
// DelNetwork deletes legacy per-network rules by running the iptabler's
// normal non-internal teardown against a temporary network object. Internal
// networks have no such rules, so they are skipped.
func (ic iptablesCleaner) DelNetwork(ctx context.Context, nc firewaller.NetworkConfig) {
	if nc.Internal {
		return
	}
	n := network{
		config: nc,
		ipt:    &iptabler{config: ic.config},
	}
	families := []struct {
		enabled bool
		ipv     iptables.IPVersion
		conf    firewaller.NetworkConfigFam
	}{
		{ic.config.IPv4, iptables.IPv4, nc.Config4},
		{ic.config.IPv6, iptables.IPv6, nc.Config6},
	}
	for _, fam := range families {
		if !fam.enabled || !fam.conf.Prefix.IsValid() {
			continue
		}
		_ = deleteLegacyFilterRules(fam.ipv, nc.IfName)
		_ = n.setupNonInternalNetworkRules(ctx, fam.ipv, fam.conf, false)
	}
}
// DelEndpoint deletes legacy direct-access filtering rules for an endpoint,
// by running the iptabler's normal endpoint teardown against a temporary
// network object.
func (ic iptablesCleaner) DelEndpoint(ctx context.Context, nc firewaller.NetworkConfig, epIPv4, epIPv6 netip.Addr) {
	n := network{
		config: nc,
		ipt:    &iptabler{config: ic.config},
	}
	families := []struct {
		enabled bool
		ipv     iptables.IPVersion
		conf    firewaller.NetworkConfigFam
		addr    netip.Addr
	}{
		{n.ipt.config.IPv4, iptables.IPv4, n.config.Config4, epIPv4},
		{n.ipt.config.IPv6, iptables.IPv6, n.config.Config6, epIPv6},
	}
	for _, fam := range families {
		if fam.enabled && fam.addr.IsValid() {
			_ = n.filterDirectAccess(ctx, fam.ipv, fam.conf, fam.addr, false)
		}
	}
}
// DelPorts deletes legacy per-port rules by running the iptabler's normal
// port teardown against a temporary network object.
func (ic iptablesCleaner) DelPorts(ctx context.Context, nc firewaller.NetworkConfig, pbs []types.PortBinding) {
	n := network{
		config: nc,
		ipt:    &iptabler{config: ic.config},
	}
	_ = n.DelPorts(ctx, pbs)
}
//go:build linux
package iptabler
import (
"context"
"net/netip"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge/internal/firewaller"
"github.com/docker/docker/daemon/libnetwork/iptables"
)
// AddEndpoint adds per-endpoint rules for a new container interface.
func (n *network) AddEndpoint(ctx context.Context, epIPv4, epIPv6 netip.Addr) error {
	return n.modEndpoint(ctx, epIPv4, epIPv6, true)
}

// DelEndpoint removes the rules added by AddEndpoint.
func (n *network) DelEndpoint(ctx context.Context, epIPv4, epIPv6 netip.Addr) error {
	return n.modEndpoint(ctx, epIPv4, epIPv6, false)
}
// modEndpoint adds (enable=true) or removes (enable=false) the direct-access
// filtering rules for an endpoint, for each enabled IP family with a valid
// endpoint address.
func (n *network) modEndpoint(ctx context.Context, epIPv4, epIPv6 netip.Addr, enable bool) error {
	families := []struct {
		enabled bool
		ipv     iptables.IPVersion
		conf    firewaller.NetworkConfigFam
		addr    netip.Addr
	}{
		{n.ipt.config.IPv4, iptables.IPv4, n.config.Config4, epIPv4},
		{n.ipt.config.IPv6, iptables.IPv6, n.config.Config6, epIPv6},
	}
	for _, fam := range families {
		if !fam.enabled || !fam.addr.IsValid() {
			continue
		}
		if err := n.filterDirectAccess(ctx, fam.ipv, fam.conf, fam.addr, enable); err != nil {
			return err
		}
	}
	return nil
}
// filterDirectAccess drops packets addressed directly to the container's IP address,
// when direct routing is not permitted by network configuration.
//
// It is a no-op if:
//   - the network is internal
//   - gateway mode is "nat-unprotected" or "routed".
//   - direct routing is enabled at the daemon level.
//   - "raw" rules are disabled (possibly because the host doesn't have the necessary
//     kernel support).
//
// Packets originating on the bridge's own interface and addressed directly to the
// container are allowed - the host always has direct access to its own containers
// (it doesn't need to use the port mapped to its own addresses, although it can).
//
// "Trusted interfaces" are treated in the same way as the bridge itself.
func (n *network) filterDirectAccess(ctx context.Context, ipv iptables.IPVersion, config firewaller.NetworkConfigFam, epIP netip.Addr, enable bool) error {
	if n.config.Internal || config.Unprotected || config.Routed {
		return nil
	}
	// For config that may change between daemon restarts, make sure rules are
	// removed - if the container was left running when the daemon stopped, and
	// direct routing has since been disabled, the rules need to be deleted when
	// cleanup happens on restart. This also means a change in config over a
	// live-restore restart will take effect.
	if n.ipt.config.AllowDirectRouting || rawRulesDisabled(ctx) {
		enable = false
	}
	// ACCEPT traffic from trusted interfaces before the DROP rule below can match.
	for _, ifName := range n.config.TrustedHostInterfaces {
		accept := iptables.Rule{IPVer: ipv, Table: iptables.Raw, Chain: "PREROUTING", Args: []string{
			"-d", epIP.String(),
			"-i", ifName,
			"-j", "ACCEPT",
		}}
		if err := appendOrDelChainRule(accept, "DIRECT ACCESS FILTERING - ACCEPT", enable); err != nil {
			return err
		}
	}
	// DROP direct-to-container traffic arriving on anything other than the
	// bridge's own interface. (Renamed from the misleading "accept" - this is
	// the drop rule.)
	drop := iptables.Rule{IPVer: ipv, Table: iptables.Raw, Chain: "PREROUTING", Args: []string{
		"-d", epIP.String(),
		"!", "-i", n.config.IfName,
		"-j", "DROP",
	}}
	return appendOrDelChainRule(drop, "DIRECT ACCESS FILTERING - DROP", enable)
}
//go:build linux
package iptabler
import (
"context"
"fmt"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge/internal/firewaller"
"github.com/docker/docker/daemon/libnetwork/internal/modprobe"
"github.com/docker/docker/daemon/libnetwork/iptables"
)
const (
	// dockerChain: DOCKER iptable chain name
	dockerChain = "DOCKER"
	// DockerForwardChain contains Docker's filter-FORWARD rules.
	//
	// FIXME(robmry) - only exported because it's used to set up the jump to swarm's DOCKER-INGRESS chain.
	DockerForwardChain = "DOCKER-FORWARD"
	// dockerBridgeChain holds the per-bridge jumps to the DOCKER chain.
	dockerBridgeChain = "DOCKER-BRIDGE"
	// dockerCTChain holds the conntrack RELATED,ESTABLISHED accept rules.
	dockerCTChain = "DOCKER-CT"
	// dockerInternalChain holds the drop rules for internal networks.
	dockerInternalChain = "DOCKER-INTERNAL"

	// These INC (inter-network communication) chains are no longer needed, packets
	// sent to unpublished ports in other networks are now dropped by rules in the DOCKER
	// chain. Packets sent directly to published ports in a different network don't need
	// to be dropped:
	// - containers in other networks have access via the host's address, and
	// - it was surprising that a container in a gwmode=nat network couldn't talk to a
	//   published port in a gwmode=routed network, but anything outside a bridge
	//   network could.
	isolationChain1 = "DOCKER-ISOLATION-STAGE-1"
	isolationChain2 = "DOCKER-ISOLATION-STAGE-2"
)
// iptabler implements firewaller.Firewaller using iptables/ip6tables rules.
type iptabler struct {
	config firewaller.Config
}
// NewIptabler returns a Firewaller backed by iptables. For each enabled IP
// family it removes any pre-existing Docker chains, recreates them, and
// registers a firewall-reload callback to recreate them again. An IPv6 chain
// setup failure is logged rather than returned, so that the daemon can still
// start with working IPv4.
func NewIptabler(ctx context.Context, config firewaller.Config) (firewaller.Firewaller, error) {
	ipt := &iptabler{config: config}
	if ipt.config.IPv4 {
		removeIPChains(ctx, iptables.IPv4)
		if err := setupIPChains(ctx, iptables.IPv4, ipt.config); err != nil {
			return nil, err
		}
		// Make sure on firewall reload, first thing being re-played is chains creation
		iptables.OnReloaded(func() {
			log.G(ctx).Debugf("Recreating iptables chains on firewall reload")
			if err := setupIPChains(ctx, iptables.IPv4, ipt.config); err != nil {
				log.G(ctx).WithError(err).Error("Error reloading iptables chains")
			}
		})
	}
	if ipt.config.IPv6 {
		// ip6_tables may not be loaded; probe by listing the filter-FORWARD
		// chain, and try to load the module if that fails.
		if err := modprobe.LoadModules(ctx, func() error {
			iptable := iptables.GetIptable(iptables.IPv6)
			_, err := iptable.Raw("-t", "filter", "-n", "-L", "FORWARD")
			return err
		}, "ip6_tables"); err != nil {
			log.G(ctx).WithError(err).Debug("Loading ip6_tables")
		}
		removeIPChains(ctx, iptables.IPv6)
		err := setupIPChains(ctx, iptables.IPv6, ipt.config)
		if err != nil {
			// If the chains couldn't be set up, it's probably because the kernel has no IPv6
			// support, or it doesn't have module ip6_tables loaded. It won't be possible to
			// create IPv6 networks without enabling ip6_tables in the kernel, or disabling
			// ip6tables in the daemon config. But, allow the daemon to start because IPv4
			// will work. So, log the problem, and continue.
			log.G(ctx).WithError(err).Warn("ip6tables is enabled, but cannot set up ip6tables chains")
		} else {
			// Make sure on firewall reload, first thing being re-played is chains creation
			iptables.OnReloaded(func() {
				log.G(ctx).Debugf("Recreating ip6tables chains on firewall reload")
				if err := setupIPChains(ctx, iptables.IPv6, ipt.config); err != nil {
					log.G(ctx).WithError(err).Error("Error reloading ip6tables chains")
				}
			})
		}
	}
	return ipt, nil
}
// FilterForwardDrop sets the default policy of the filter-FORWARD chain for
// the given IP version to DROP, and registers a firewall-reload callback to
// re-apply the policy.
func (ipt *iptabler) FilterForwardDrop(ctx context.Context, ipv firewaller.IPVersion) error {
	var iptv iptables.IPVersion
	switch ipv {
	case firewaller.IPv4:
		iptv = iptables.IPv4
	case firewaller.IPv6:
		iptv = iptables.IPv6
	default:
		return fmt.Errorf("unknown IP version %v", ipv)
	}
	iptable := iptables.GetIptable(iptv)
	setPolicy := func() error {
		return iptable.SetDefaultPolicy(iptables.Filter, "FORWARD", iptables.Drop)
	}
	if err := setPolicy(); err != nil {
		return err
	}
	iptables.OnReloaded(func() {
		log.G(ctx).WithFields(log.Fields{"ipv": ipv}).Debug("Setting the default DROP policy on firewall reload")
		if err := setPolicy(); err != nil {
			log.G(ctx).WithFields(log.Fields{
				"error": err,
				"ipv":   ipv,
			}).Warn("Failed to set the default DROP policy on firewall reload")
		}
	})
	return nil
}
// setupIPChains creates Docker's user-defined chains for one IP family, adds
// the NAT jump rules and the jumps from the built-in filter-FORWARD chain,
// applies the WSL2 workaround if needed, and removes legacy top-level rules.
//
// On error, deferred cleanups undo everything created so far, in reverse
// (LIFO) order, so a partial setup is not left behind.
func setupIPChains(ctx context.Context, version iptables.IPVersion, iptCfg firewaller.Config) (retErr error) {
	iptable := iptables.GetIptable(version)

	// Create each chain; register a deferred removal that fires only if a
	// later step fails. This replaces six hand-rolled copies of the same
	// create+cleanup pattern.
	chains := []struct {
		name  string
		table iptables.Table
		descr string // table description used in error/log messages
	}{
		{dockerChain, iptables.Nat, "NAT"},
		{dockerChain, iptables.Filter, "FILTER"},
		{DockerForwardChain, iptables.Filter, "FILTER"},
		{dockerBridgeChain, iptables.Filter, "FILTER"},
		{dockerCTChain, iptables.Filter, "FILTER"},
		{dockerInternalChain, iptables.Filter, "FILTER"},
	}
	for _, c := range chains {
		if _, err := iptable.NewChain(c.name, c.table); err != nil {
			return fmt.Errorf("failed to create %s chain %s: %v", c.descr, c.name, err)
		}
		// Parameters are passed by value so each deferred call cleans up its
		// own chain; defers run LIFO, matching the original reverse order.
		defer func(name, descr string, table iptables.Table) {
			if retErr != nil {
				if err := iptable.RemoveExistingChain(name, table); err != nil {
					log.G(ctx).Warnf("failed on removing iptables %s chain %s on cleanup: %v", descr, name, err)
				}
			}
		}(c.name, c.descr, c.table)
	}

	if err := addNATJumpRules(version, iptCfg.Hairpin, true); err != nil {
		return fmt.Errorf("failed to add jump rules to %s NAT table: %w", version, err)
	}
	defer func() {
		if retErr != nil {
			if err := addNATJumpRules(version, iptCfg.Hairpin, false); err != nil {
				log.G(ctx).Warnf("failed on removing jump rules from %s NAT table: %v", version, err)
			}
		}
	}()

	// Make sure the filter-FORWARD chain has rules to accept related packets and
	// jump to the isolation and docker chains. (Re-)insert at the top of the table,
	// in reverse order.
	if err := iptable.EnsureJumpRule("FORWARD", DockerForwardChain); err != nil {
		return err
	}
	if err := iptable.EnsureJumpRule(DockerForwardChain, dockerBridgeChain); err != nil {
		return err
	}
	if err := iptable.EnsureJumpRule(DockerForwardChain, dockerInternalChain); err != nil {
		return err
	}
	if err := iptable.EnsureJumpRule(DockerForwardChain, dockerCTChain); err != nil {
		return err
	}

	if err := mirroredWSL2Workaround(version, !iptCfg.Hairpin && iptCfg.WSL2Mirrored); err != nil {
		return err
	}

	return deleteLegacyTopLevelRules(ctx, iptable, version)
}
// deleteLegacyTopLevelRules deletes rules that may have been added to the
// FORWARD chain by moby 28.0.0 or earlier. Failure to delete the two
// ipset-matching rules is only logged at debug level (they may simply not
// exist); failure to delete the isolation jump is returned.
func deleteLegacyTopLevelRules(ctx context.Context, iptable *iptables.IPTable, version iptables.IPVersion) error {
	ipsetName := "docker-ext-bridges-v4"
	if version == iptables.IPv6 {
		ipsetName = "docker-ext-bridges-v6"
	}
	if err := iptable.DeleteJumpRule("FORWARD", dockerChain,
		"-m", "set", "--match-set", ipsetName, "dst"); err != nil {
		log.G(ctx).WithFields(log.Fields{"error": err, "set": ipsetName}).Debug(
			"deleting legacy ipset dest match rule")
	}
	if err := iptable.DeleteJumpRule("FORWARD", isolationChain1); err != nil {
		return err
	}
	if err := iptable.DeleteJumpRule("FORWARD", "ACCEPT",
		"-m", "set", "--match-set", ipsetName, "dst",
		"-m", "conntrack", "--ctstate", "RELATED,ESTABLISHED",
	); err != nil {
		log.G(ctx).WithFields(log.Fields{"error": err, "set": ipsetName}).Debug(
			"deleting legacy ipset conntrack rule")
	}
	return nil
}
// programChainRule inserts rule at the top of its chain when insert is true,
// otherwise deletes it; failures are wrapped with ruleDescr for context.
func programChainRule(rule iptables.Rule, ruleDescr string, insert bool) error {
	operation, apply := "disable", rule.Delete
	if insert {
		operation, apply = "enable", rule.Insert
	}
	if err := apply(); err != nil {
		return fmt.Errorf("Unable to %s %s rule: %w", operation, ruleDescr, err)
	}
	return nil
}
// appendOrDelChainRule appends rule to the end of its chain when shouldAppend
// is true, otherwise deletes it; failures are wrapped with ruleDescr.
func appendOrDelChainRule(rule iptables.Rule, ruleDescr string, shouldAppend bool) error {
	operation, apply := "disable", rule.Delete
	if shouldAppend {
		operation, apply = "enable", rule.Append
	}
	if err := apply(); err != nil {
		return fmt.Errorf("Unable to %s %s rule: %w", operation, ruleDescr, err)
	}
	return nil
}
//go:build linux
package iptabler
import (
"context"
"errors"
"net/netip"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/iptables"
"github.com/docker/docker/daemon/libnetwork/types"
)
// AddLink appends the legacy --link rules allowing the parent container to
// reach the given ports on the child container. Both addresses must be valid
// and specified.
func (n *network) AddLink(ctx context.Context, parentIP, childIP netip.Addr, ports []types.TransportPort) error {
	checkAddr := func(ip netip.Addr, role string) error {
		if !ip.IsValid() || ip.IsUnspecified() {
			return errors.New("cannot link to a container with an empty " + role + " IP address")
		}
		return nil
	}
	if err := checkAddr(parentIP, "parent"); err != nil {
		return err
	}
	if err := checkAddr(childIP, "child"); err != nil {
		return err
	}
	chain := iptables.ChainInfo{Name: dockerChain}
	for _, port := range ports {
		if err := chain.Link(iptables.Append, parentIP, childIP, int(port.Port), port.Proto.String(), n.config.IfName); err != nil {
			return err
		}
	}
	return nil
}
// DelLink removes the rules added by AddLink. Deletion is best-effort:
// failures are logged, not returned (the method has no error result).
func (n *network) DelLink(ctx context.Context, parentIP, childIP netip.Addr, ports []types.TransportPort) {
	chain := iptables.ChainInfo{Name: dockerChain}
	for _, port := range ports {
		if err := chain.Link(iptables.Delete, parentIP, childIP, int(port.Port), port.Proto.String(), n.config.IfName); err != nil {
			log.G(ctx).WithFields(log.Fields{
				"parentIP": parentIP,
				"childIP":  childIP,
				"port":     port.Port,
				"protocol": port.Proto.String(),
				"bridge":   n.config.IfName,
				"error":    err,
			}).Warn("Failed to remove link between containers")
		}
	}
}
//go:build linux
package iptabler
import (
"context"
"errors"
"fmt"
"net/netip"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge/internal/firewaller"
"github.com/docker/docker/daemon/libnetwork/iptables"
)
type (
	// iptableCleanFunc undoes one piece of iptables setup.
	iptableCleanFunc   func() error
	iptablesCleanFuncs []iptableCleanFunc
)

// network implements firewaller.Network using iptables rules.
type network struct {
	config firewaller.NetworkConfig
	ipt    *iptabler // parent firewaller, carrying daemon-level config
	// cleanFuncs are run by DelNetworkLevelRules to undo rules added so far.
	cleanFuncs iptablesCleanFuncs
}
// NewNetwork creates a firewaller.Network for the given network config and
// applies its network-level rules. If rule setup fails, any rules already
// added are deleted before the error is returned.
func (ipt *iptabler) NewNetwork(ctx context.Context, nc firewaller.NetworkConfig) (_ firewaller.Network, retErr error) {
	n := &network{
		ipt:    ipt,
		config: nc,
	}
	defer func() {
		if retErr != nil {
			// Undo any rules that were applied before the failure.
			if err := n.DelNetworkLevelRules(ctx); err != nil {
				log.G(ctx).WithError(err).Warnf("Failed to delete network level rules following earlier error")
			}
		}
	}()
	if err := n.ReapplyNetworkLevelRules(ctx); err != nil {
		return nil, err
	}
	return n, nil
}
// ReapplyNetworkLevelRules (re-)applies the network-level rules for each
// enabled IP family.
func (n *network) ReapplyNetworkLevelRules(ctx context.Context) error {
	families := []struct {
		enabled bool
		ipv     iptables.IPVersion
		conf    firewaller.NetworkConfigFam
	}{
		{n.ipt.config.IPv4, iptables.IPv4, n.config.Config4},
		{n.ipt.config.IPv6, iptables.IPv6, n.config.Config6},
	}
	for _, fam := range families {
		if !fam.enabled {
			continue
		}
		if err := n.configure(ctx, fam.ipv, fam.conf); err != nil {
			return err
		}
	}
	return nil
}
// DelNetworkLevelRules runs all registered cleanup functions, then clears the
// list. All cleanups are attempted; their errors are combined (errors.Join
// discards nil entries, so successful cleanups contribute nothing).
func (n *network) DelNetworkLevelRules(_ context.Context) error {
	errs := make([]error, 0, len(n.cleanFuncs))
	for _, clean := range n.cleanFuncs {
		errs = append(errs, clean())
	}
	n.cleanFuncs = nil
	return errors.Join(errs...)
}
// configure sets up the iptables rules for one address family; it is a no-op
// if the network has no prefix for that family.
func (n *network) configure(ctx context.Context, ipv iptables.IPVersion, conf firewaller.NetworkConfigFam) error {
	if !conf.Prefix.IsValid() {
		return nil
	}
	return n.setupIPTables(ctx, ipv, conf)
}

// registerCleanFunc records a cleanup to be run by DelNetworkLevelRules.
func (n *network) registerCleanFunc(clean iptableCleanFunc) {
	n.cleanFuncs = append(n.cleanFuncs, clean)
}
// setupIPTables installs the network-level rules for one address family and
// registers a matching cleanup for each rule set as it is added, so a partial
// failure can be unwound by DelNetworkLevelRules. Internal networks get only
// the internal drop rules; other networks get the full non-internal rule set
// plus conntrack-accept and jump-to-DOCKER rules for the bridge interface.
func (n *network) setupIPTables(ctx context.Context, ipVersion iptables.IPVersion, config firewaller.NetworkConfigFam) error {
	if n.config.Internal {
		if err := setupInternalNetworkRules(ctx, n.config.IfName, config.Prefix, n.config.ICC, true); err != nil {
			return fmt.Errorf("Failed to Setup IP tables: %w", err)
		}
		n.registerCleanFunc(func() error {
			return setupInternalNetworkRules(ctx, n.config.IfName, config.Prefix, n.config.ICC, false)
		})
	} else {
		if err := n.setupNonInternalNetworkRules(ctx, ipVersion, config, true); err != nil {
			return fmt.Errorf("Failed to Setup IP tables: %w", err)
		}
		n.registerCleanFunc(func() error {
			return n.setupNonInternalNetworkRules(ctx, ipVersion, config, false)
		})
		// Remove per-bridge rules left in filter-FORWARD by pre-28.0 daemons.
		if err := deleteLegacyFilterRules(ipVersion, n.config.IfName); err != nil {
			return fmt.Errorf("failed to delete legacy rules in filter-FORWARD: %w", err)
		}
		// Default DROP (or ACCEPT for nat-unprotected) for unmatched traffic
		// to this bridge.
		err := setDefaultForwardRule(ipVersion, n.config.IfName, config.Unprotected, true)
		if err != nil {
			return err
		}
		n.registerCleanFunc(func() error {
			return setDefaultForwardRule(ipVersion, n.config.IfName, config.Unprotected, false)
		})
		// Accept RELATED,ESTABLISHED traffic to the bridge.
		ctRule := iptables.Rule{IPVer: ipVersion, Table: iptables.Filter, Chain: dockerCTChain, Args: []string{
			"-o", n.config.IfName,
			"-m", "conntrack", "--ctstate", "RELATED,ESTABLISHED",
			"-j", "ACCEPT",
		}}
		if err := appendOrDelChainRule(ctRule, "bridge ct related", true); err != nil {
			return err
		}
		n.registerCleanFunc(func() error {
			return appendOrDelChainRule(ctRule, "bridge ct related", false)
		})
		// Send traffic destined for this bridge to the DOCKER chain.
		jumpToDockerRule := iptables.Rule{IPVer: ipVersion, Table: iptables.Filter, Chain: dockerBridgeChain, Args: []string{
			"-o", n.config.IfName,
			"-j", dockerChain,
		}}
		if err := appendOrDelChainRule(jumpToDockerRule, "jump to docker", true); err != nil {
			return err
		}
		n.registerCleanFunc(func() error {
			return appendOrDelChainRule(jumpToDockerRule, "jump to docker", false)
		})
	}
	return nil
}
// setICMP adds (or removes, when enable is false) a rule accepting ICMP /
// ICMPv6 traffic to the bridge in the DOCKER chain.
func setICMP(ipv iptables.IPVersion, bridgeName string, enable bool) error {
	proto := "icmp"
	if ipv == iptables.IPv6 {
		proto = "icmpv6"
	}
	rule := iptables.Rule{
		IPVer: ipv,
		Table: iptables.Filter,
		Chain: dockerChain,
		Args:  []string{"-o", bridgeName, "-p", proto, "-j", "ACCEPT"},
	}
	return appendOrDelChainRule(rule, "ICMP", enable)
}
// addNATJumpRules appends (enable=true) or deletes (enable=false) the jumps
// from nat-PREROUTING and nat-OUTPUT to the DOCKER chain for locally-destined
// traffic. Without hairpin mode, the OUTPUT jump excludes loopback
// destinations.
func addNATJumpRules(ipVer iptables.IPVersion, hairpinMode, enable bool) error {
	apply := func(r iptables.Rule, chain string) error {
		if enable {
			if err := r.Append(); err != nil {
				return fmt.Errorf("failed to append jump rules to nat-%s: %s", chain, err)
			}
			return nil
		}
		if err := r.Delete(); err != nil {
			return fmt.Errorf("failed to remove jump rules from nat-%s: %s", chain, err)
		}
		return nil
	}

	preroute := iptables.Rule{IPVer: ipVer, Table: iptables.Nat, Chain: "PREROUTING", Args: []string{
		"-m", "addrtype",
		"--dst-type", "LOCAL",
		"-j", dockerChain,
	}}
	if err := apply(preroute, "PREROUTING"); err != nil {
		return err
	}

	output := iptables.Rule{IPVer: ipVer, Table: iptables.Nat, Chain: "OUTPUT", Args: []string{
		"-m", "addrtype",
		"--dst-type", "LOCAL",
		"-j", dockerChain,
	}}
	if !hairpinMode {
		// Skip loopback destinations; hairpin NAT is not in use.
		output.Args = append(output.Args, "!", "--dst", loopbackAddress(ipVer))
	}
	return apply(output, "OUTPUT")
}
// deleteLegacyFilterRules removes the legacy per-bridge rules from the filter-FORWARD
// chain. This is required for users upgrading the Engine to v28.0.
// TODO(aker): drop this function once Mirantis latest LTS is v28.0 (or higher).
func deleteLegacyFilterRules(ipVer iptables.IPVersion, bridgeName string) error {
	iptable := iptables.GetIptable(ipVer)

	// delRule deletes one FORWARD-chain rule if it exists; "what" names the
	// rule in the error message.
	delRule := func(args []string, what string) error {
		if !iptable.Exists(iptables.Filter, "FORWARD", args...) {
			return nil
		}
		del := append([]string{string(iptables.Delete), "FORWARD"}, args...)
		output, err := iptable.Raw(del...)
		if err != nil {
			return err
		}
		if len(output) != 0 {
			return fmt.Errorf("could not delete %s rule from %s-%s: %s", what, iptables.Filter, dockerChain, output)
		}
		return nil
	}

	// Legacy per-bridge jump to the DOCKER chain; replaced by an ipset-matching rule.
	if err := delRule([]string{
		"-o", bridgeName,
		"-j", dockerChain,
	}, "linking"); err != nil {
		return err
	}

	// Legacy per-bridge related/established accept; replaced by an ipset-matching rule.
	return delRule([]string{
		"-o", bridgeName,
		"-m", "conntrack",
		"--ctstate", "RELATED,ESTABLISHED",
		"-j", "ACCEPT",
	}, "establish")
}
// loopbackAddress returns the loopback address for the given IP version. An
// empty version defaults to IPv4 for backward compatibility; any other value
// is a programming error and panics.
func loopbackAddress(version iptables.IPVersion) string {
	if version == iptables.IPv6 {
		return "::1/128"
	}
	if version == iptables.IPv4 || version == "" {
		return "127.0.0.0/8"
	}
	panic("unknown IP version: " + version)
}
// setDefaultForwardRule adds (or removes, when enable is false) the DOCKER
// chain's default rule for traffic entering the bridge from elsewhere.
//
// Normally this is DROP: it blocks direct access to un-mapped ports from
// remote hosts that can route straight to the container's address (via a
// route through the host). For nat-unprotected networks it is an explicit
// ACCEPT instead, so the filter-FORWARD chain's default policy can't
// interfere.
func setDefaultForwardRule(ipVersion iptables.IPVersion, ifName string, unprotected bool, enable bool) error {
	var action string
	if unprotected {
		action = "ACCEPT"
	} else {
		action = "DROP"
	}
	rule := iptables.Rule{
		IPVer: ipVersion,
		Table: iptables.Filter,
		Chain: dockerChain,
		Args:  []string{"!", "-i", ifName, "-o", ifName, "-j", action},
	}
	// Appended, not inserted: the default rule must follow the per-port
	// ACCEPT rules, which are inserted at the top of the chain.
	if err := appendOrDelChainRule(rule, "DEFAULT FWD", enable); err != nil {
		return fmt.Errorf("failed to add default-drop rule: %w", err)
	}
	return nil
}
// setupNonInternalNetworkRules adds (enable=true) or removes (enable=false)
// the rules for a network with external connectivity: SNAT or MASQUERADE,
// localhost masquerading in hairpin mode, inter-container communication,
// ICMP in routed mode, and the outgoing-traffic accept rules. When enabling,
// it also deletes equivalent rules left in the FORWARD chain by moby 28.0.0
// or older.
func (n *network) setupNonInternalNetworkRules(ctx context.Context, ipVer iptables.IPVersion, config firewaller.NetworkConfigFam, enable bool) error {
	var natArgs, hpNatArgs []string
	if config.HostIP.IsValid() {
		// The user wants IPv4/IPv6 SNAT with the given address.
		hostAddr := config.HostIP.String()
		natArgs = []string{"-s", config.Prefix.String(), "!", "-o", n.config.IfName, "-j", "SNAT", "--to-source", hostAddr}
		hpNatArgs = []string{"-m", "addrtype", "--src-type", "LOCAL", "-o", n.config.IfName, "-j", "SNAT", "--to-source", hostAddr}
	} else {
		// Use MASQUERADE, which picks the src-ip based on next-hop from the route table
		natArgs = []string{"-s", config.Prefix.String(), "!", "-o", n.config.IfName, "-j", "MASQUERADE"}
		hpNatArgs = []string{"-m", "addrtype", "--src-type", "LOCAL", "-o", n.config.IfName, "-j", "MASQUERADE"}
	}
	natRule := iptables.Rule{IPVer: ipVer, Table: iptables.Nat, Chain: "POSTROUTING", Args: natArgs}
	hpNatRule := iptables.Rule{IPVer: ipVer, Table: iptables.Nat, Chain: "POSTROUTING", Args: hpNatArgs}

	// Set NAT. Skipped entirely in routed mode (nat == false) or when the
	// network has masquerading disabled.
	nat := !config.Routed
	if n.config.Masquerade {
		if nat {
			if err := programChainRule(natRule, "NAT", enable); err != nil {
				return err
			}
		}
	}

	// In hairpin mode, masquerade traffic from localhost. If hairpin is disabled or if we're tearing down
	// that bridge, make sure the iptables rule isn't lying around.
	if err := programChainRule(hpNatRule, "MASQ LOCAL HOST", enable && n.ipt.config.Hairpin); err != nil {
		return err
	}

	// Set Inter Container Communication.
	if err := setIcc(ctx, ipVer, n.config.IfName, n.config.ICC, false, enable); err != nil {
		return err
	}

	// Allow ICMP in routed mode.
	if !nat {
		if err := setICMP(ipVer, n.config.IfName, enable); err != nil {
			return err
		}
	}

	// Handle outgoing packets. This rule was previously added unconditionally
	// to ACCEPT packets that weren't ICC - an extra rule was needed to enable
	// ICC if needed. Those rules are now combined. So, outRuleNoICC is only
	// needed for ICC=false, along with the DROP rule for ICC added by setIcc.
	outRuleNoICC := iptables.Rule{IPVer: ipVer, Table: iptables.Filter, Chain: DockerForwardChain, Args: []string{
		"-i", n.config.IfName,
		"!", "-o", n.config.IfName,
		"-j", "ACCEPT",
	}}
	// If there's a version of outRuleNoICC in the FORWARD chain, created by moby 28.0.0 or older, delete it.
	if enable {
		if err := outRuleNoICC.WithChain("FORWARD").Delete(); err != nil {
			return fmt.Errorf("deleting FORWARD chain outRuleNoICC: %w", err)
		}
	}
	if n.config.ICC {
		// Accept outgoing traffic to anywhere, including other containers on this bridge.
		outRuleICC := iptables.Rule{IPVer: ipVer, Table: iptables.Filter, Chain: DockerForwardChain, Args: []string{
			"-i", n.config.IfName,
			"-j", "ACCEPT",
		}}
		if err := appendOrDelChainRule(outRuleICC, "ACCEPT OUTGOING", enable); err != nil {
			return err
		}
		// If there's a version of outRuleICC in the FORWARD chain, created by moby 28.0.0 or older, delete it.
		if enable {
			if err := outRuleICC.WithChain("FORWARD").Delete(); err != nil {
				return fmt.Errorf("deleting FORWARD chain outRuleICC: %w", err)
			}
		}
	} else {
		// Accept outgoing traffic to anywhere, apart from other containers on this bridge.
		// setIcc added a DROP rule for ICC traffic.
		if err := appendOrDelChainRule(outRuleNoICC, "ACCEPT NON_ICC OUTGOING", enable); err != nil {
			return err
		}
	}
	return nil
}
// setIcc installs (insert=true) or removes (insert=false) the
// inter-container-communication rules for bridgeIface. With ICC disabled, a
// DROP rule blocks bridge-to-same-bridge traffic; with ICC enabled on an
// internal network, an ACCEPT rule allows it. When inserting, stale variants
// of both rules are also deleted - from DOCKER-FORWARD (rules a previous
// configuration may have left) and from FORWARD (rules left by moby 28.0.0
// or older).
func setIcc(ctx context.Context, version iptables.IPVersion, bridgeIface string, iccEnable, internal, insert bool) error {
	// Note: args has len == cap, so each append below allocates a fresh
	// backing array - acceptRule and dropRule do not alias each other.
	args := []string{"-i", bridgeIface, "-o", bridgeIface, "-j"}
	acceptRule := iptables.Rule{IPVer: version, Table: iptables.Filter, Chain: DockerForwardChain, Args: append(args, "ACCEPT")}
	dropRule := iptables.Rule{IPVer: version, Table: iptables.Filter, Chain: DockerForwardChain, Args: append(args, "DROP")}

	// The accept rule is no longer required for a bridge with external connectivity, because
	// ICC traffic is allowed by the outgoing-packets rule created by setupIptablesInternal.
	// The accept rule is still required for a --internal network because it has no outgoing
	// rule. If insert and the rule is not required, an ACCEPT rule for an external network
	// may have been left behind by an older version of the daemon so, delete it.
	if insert && iccEnable && internal {
		if err := acceptRule.Append(); err != nil {
			return fmt.Errorf("Unable to allow intercontainer communication: %w", err)
		}
	} else {
		// Best-effort delete: the rule may not exist.
		if err := acceptRule.Delete(); err != nil {
			log.G(ctx).WithError(err).Warn("Failed to delete legacy ICC accept rule")
		}
	}

	if insert && !iccEnable {
		if err := dropRule.Append(); err != nil {
			return fmt.Errorf("Unable to prevent intercontainer communication: %w", err)
		}
	} else {
		// Best-effort delete: the rule may not exist.
		if err := dropRule.Delete(); err != nil {
			log.G(ctx).WithError(err).Warn("Failed to delete ICC drop rule")
		}
	}

	// Delete rules that may have been inserted into the FORWARD chain by moby 28.0.0 or older.
	if insert {
		if err := acceptRule.WithChain("FORWARD").Delete(); err != nil {
			return fmt.Errorf("deleting FORWARD chain accept rule: %w", err)
		}
		if err := dropRule.WithChain("FORWARD").Delete(); err != nil {
			return fmt.Errorf("deleting FORWARD chain drop rule: %w", err)
		}
	}
	return nil
}
// oldIsolationChain is an obsolete chain from previous docker versions.
const oldIsolationChain = "DOCKER-ISOLATION"

// removeIPChains removes all of Docker's user-defined chains (current and
// obsolete) for the given IP version, and the obsolete jump to
// DOCKER-ISOLATION from filter-FORWARD. Failures to remove individual chains
// are logged and do not stop the remaining removals.
func removeIPChains(ctx context.Context, version iptables.IPVersion) {
	ipt := iptables.GetIptable(version)

	// Remove obsolete rules from default chains
	ipt.ProgramRule(iptables.Filter, "FORWARD", iptables.Delete, []string{"-j", oldIsolationChain})

	// Remove chains
	for _, chainInfo := range []iptables.ChainInfo{
		{Name: dockerChain, Table: iptables.Nat, IPVersion: version},
		{Name: DockerForwardChain, Table: iptables.Filter, IPVersion: version},
		{Name: dockerBridgeChain, Table: iptables.Filter, IPVersion: version},
		{Name: dockerChain, Table: iptables.Filter, IPVersion: version},
		{Name: dockerCTChain, Table: iptables.Filter, IPVersion: version},
		{Name: dockerInternalChain, Table: iptables.Filter, IPVersion: version},
		{Name: isolationChain1, Table: iptables.Filter, IPVersion: version},
		{Name: isolationChain2, Table: iptables.Filter, IPVersion: version},
		{Name: oldIsolationChain, Table: iptables.Filter, IPVersion: version},
	} {
		if err := chainInfo.Remove(); err != nil {
			log.G(ctx).Warnf("Failed to remove existing iptables entries in table %s chain %s : %v", chainInfo.Table, chainInfo.Name, err)
		}
	}
}
// setupInternalNetworkRules installs (insert=true) or removes (insert=false)
// the DROP rules that confine an --internal network's traffic to its own
// subnet, then programs the ICC accept/drop rule for the bridge.
func setupInternalNetworkRules(ctx context.Context, bridgeIface string, prefix netip.Prefix, icc, insert bool) error {
	var (
		version iptables.IPVersion
		inArgs  []string
		outArgs []string
	)
	if prefix.Addr().Is4() {
		version = iptables.IPv4
		inArgs = []string{"-i", bridgeIface, "!", "-d", prefix.String(), "-j", "DROP"}
		outArgs = []string{"-o", bridgeIface, "!", "-s", prefix.String(), "-j", "DROP"}
	} else {
		// The IPv6 rules additionally exclude traffic that stays on the
		// bridge itself (same in/out interface).
		version = iptables.IPv6
		inArgs = []string{"-i", bridgeIface, "!", "-o", bridgeIface, "!", "-d", prefix.String(), "-j", "DROP"}
		outArgs = []string{"!", "-i", bridgeIface, "-o", bridgeIface, "!", "-s", prefix.String(), "-j", "DROP"}
	}
	inDropRule := iptables.Rule{IPVer: version, Table: iptables.Filter, Chain: dockerInternalChain, Args: inArgs}
	outDropRule := iptables.Rule{IPVer: version, Table: iptables.Filter, Chain: dockerInternalChain, Args: outArgs}

	if err := programChainRule(inDropRule, "DROP INCOMING", insert); err != nil {
		return err
	}
	if err := programChainRule(outDropRule, "DROP OUTGOING", insert); err != nil {
		return err
	}
	// Set Inter Container Communication.
	return setIcc(ctx, version, bridgeIface, icc, true, insert)
}
//go:build linux
package iptabler
import (
"context"
"net"
"os"
"strconv"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/iptables"
"github.com/docker/docker/daemon/libnetwork/types"
)
// AddPorts creates the per-port iptables rules for each port binding in pbs.
// It is the additive counterpart of DelPorts.
func (n *network) AddPorts(ctx context.Context, pbs []types.PortBinding) error {
	return n.modPorts(ctx, pbs, true)
}
// DelPorts removes the per-port iptables rules for each port binding in pbs.
// It is the inverse of AddPorts.
func (n *network) DelPorts(ctx context.Context, pbs []types.PortBinding) error {
	return n.modPorts(ctx, pbs, false)
}
// modPorts applies (enable=true) or removes (enable=false) the per-port
// iptables rules for every binding in pbs, stopping at the first error.
func (n *network) modPorts(ctx context.Context, pbs []types.PortBinding, enable bool) error {
	for i := range pbs {
		if err := n.setPerPortIptables(ctx, pbs[i], enable); err != nil {
			return err
		}
	}
	return nil
}
// setPerPortIptables configures rules required by port binding b. Rules are added if
// enable is true, else removed.
func (n *network) setPerPortIptables(ctx context.Context, b types.PortBinding, enable bool) error {
	// Select the address family from the container-side address.
	ipv := iptables.IPv4
	famEnabled := n.ipt.config.IPv4
	famConf := n.config.Config4
	if b.IP.To4() == nil {
		ipv = iptables.IPv6
		famEnabled = n.ipt.config.IPv6
		famConf = n.config.Config6
	}
	if !famEnabled || n.config.Internal {
		// Nothing to do.
		return nil
	}

	if err := filterPortMappedOnLoopback(ctx, b, b.HostIP, n.ipt.config.WSL2Mirrored, enable); err != nil {
		return err
	}
	if err := n.dropLegacyFilterDirectAccess(ctx, b); err != nil {
		return err
	}

	if (b.IP.To4() != nil) != (b.HostIP.To4() != nil) {
		// The binding is between containerV4 and hostV6 (not vice versa as that
		// will have been rejected earlier). It's handled by docker-proxy. So, no
		// further iptables rules are required.
		return nil
	}

	if err := n.setPerPortNAT(ipv, b, enable); err != nil {
		return err
	}
	if famConf.Unprotected {
		// gw_mode=nat-unprotected: skip the per-port ACCEPT rule.
		return nil
	}
	return setPerPortForwarding(b, ipv, n.config.IfName, enable)
}
// setPerPortNAT configures DNAT and MASQUERADE rules for port binding b. Rules are added if
// enable is true, else removed.
func (n *network) setPerPortNAT(ipv iptables.IPVersion, b types.PortBinding, enable bool) error {
	if b.HostPort == 0 {
		// NAT is disabled.
		return nil
	}
	// iptables interprets "0.0.0.0" as "0.0.0.0/32", whereas we
	// want "0.0.0.0/0". "0/0" is correctly interpreted as "any
	// value" by both iptables and ip6tables.
	hostIP := "0/0"
	if !b.HostIP.IsUnspecified() {
		hostIP = b.HostIP.String()
	}

	dnatArgs := []string{
		"-p", b.Proto.String(),
		"-d", hostIP,
		"--dport", strconv.Itoa(int(b.HostPort)),
		"-j", "DNAT",
		"--to-destination", net.JoinHostPort(b.IP.String(), strconv.Itoa(int(b.Port))),
	}
	if !n.ipt.config.Hairpin {
		// With the userland proxy, don't DNAT packets arriving on the bridge itself.
		dnatArgs = append(dnatArgs, "!", "-i", n.config.IfName)
	}
	if ipv == iptables.IPv6 {
		// Never DNAT packets with a link-local source.
		dnatArgs = append(dnatArgs, "!", "-s", "fe80::/10")
	}
	dnatRule := iptables.Rule{IPVer: ipv, Table: iptables.Nat, Chain: dockerChain, Args: dnatArgs}
	if err := appendOrDelChainRule(dnatRule, "DNAT", enable); err != nil {
		return err
	}

	// Masquerade same-container traffic (src == dst); only enabled in hairpin mode.
	masqRule := iptables.Rule{IPVer: ipv, Table: iptables.Nat, Chain: "POSTROUTING", Args: []string{
		"-p", b.Proto.String(),
		"-s", b.IP.String(),
		"-d", b.IP.String(),
		"--dport", strconv.Itoa(int(b.Port)),
		"-j", "MASQUERADE",
	}}
	return appendOrDelChainRule(masqRule, "MASQUERADE", n.ipt.config.Hairpin && enable)
}
// setPerPortForwarding opens access to a container's published port, as described by binding b.
// It also does something weird, broken, and disabled-by-default related to SCTP. Rules are added
// if enable is true, else removed.
func setPerPortForwarding(b types.PortBinding, ipv iptables.IPVersion, bridgeName string, enable bool) error {
	// Insert rules for open ports at the top of the filter table's DOCKER
	// chain (a per-network DROP rule, which must come after these per-port
	// per-container ACCEPT rules, is appended to the chain when the network
	// is created).
	openPortRule := iptables.Rule{IPVer: ipv, Table: iptables.Filter, Chain: dockerChain, Args: []string{
		"!", "-i", bridgeName,
		"-o", bridgeName,
		"-p", b.Proto.String(),
		"-d", b.IP.String(),
		"--dport", strconv.Itoa(int(b.Port)),
		"-j", "ACCEPT",
	}}
	if err := programChainRule(openPortRule, "OPEN PORT", enable); err != nil {
		return err
	}

	// TODO(robmry) - remove, see https://github.com/moby/moby/pull/48149
	if b.Proto != types.SCTP || os.Getenv("DOCKER_IPTABLES_SCTP_CHECKSUM") != "1" {
		return nil
	}
	// Linux kernel v4.9 and below enables NETIF_F_SCTP_CRC for veth by
	// the following commit.
	// This introduces a problem when combined with a physical NIC without
	// NETIF_F_SCTP_CRC. As for a workaround, here we add an iptables entry
	// to fill the checksum.
	//
	// https://github.com/torvalds/linux/commit/c80fafbbb59ef9924962f83aac85531039395b18
	csumRule := iptables.Rule{IPVer: ipv, Table: iptables.Mangle, Chain: "POSTROUTING", Args: []string{
		"-p", b.Proto.String(),
		"--sport", strconv.Itoa(int(b.Port)),
		"-j", "CHECKSUM",
		"--checksum-fill",
	}}
	return appendOrDelChainRule(csumRule, "SCTP CHECKSUM", enable)
}
// filterPortMappedOnLoopback adds an iptables rule that drops remote
// connections to ports mapped on loopback addresses.
//
// This is a no-op if the portBinding is for IPv6 (IPv6 loopback address is
// non-routable), or over a network with gw_mode=routed (PBs in routed mode
// don't map ports on the host).
func filterPortMappedOnLoopback(ctx context.Context, b types.PortBinding, hostIP net.IP, wsl2Mirrored, enable bool) error {
	if rawRulesDisabled(ctx) {
		return nil
	}
	// Only NATed bindings on an IPv4 loopback address need filtering.
	if b.HostPort == 0 || !hostIP.IsLoopback() || hostIP.To4() == nil {
		return nil
	}

	// Under WSL2 mirrored networking, packets from the Windows host arrive on
	// "loopback0"; accept those before the DROP rule below.
	acceptMirrored := iptables.Rule{IPVer: iptables.IPv4, Table: iptables.Raw, Chain: "PREROUTING", Args: []string{
		"-p", b.Proto.String(),
		"-d", hostIP.String(),
		"--dport", strconv.Itoa(int(b.HostPort)),
		"-i", "loopback0",
		"-j", "ACCEPT",
	}}
	if err := appendOrDelChainRule(acceptMirrored, "LOOPBACK FILTERING - ACCEPT MIRRORED", enable && wsl2Mirrored); err != nil {
		return err
	}

	// Drop anything addressed to the loopback-mapped port that didn't arrive on lo.
	dropRemote := iptables.Rule{IPVer: iptables.IPv4, Table: iptables.Raw, Chain: "PREROUTING", Args: []string{
		"-p", b.Proto.String(),
		"-d", hostIP.String(),
		"--dport", strconv.Itoa(int(b.HostPort)),
		"!", "-i", "lo",
		"-j", "DROP",
	}}
	return appendOrDelChainRule(dropRemote, "LOOPBACK FILTERING - DROP", enable)
}
// dropLegacyFilterDirectAccess deletes a rule that was introduced in 28.0.0 to
// drop 'direct' remote connections made to the container's IP address - for
// each published port on the container.
//
// The normal filter-FORWARD rules would then drop packets sent directly to
// unpublished ports. This rule was only created along with the rest of port
// publishing (when a container's endpoint was selected as its gateway). Until
// then, all packets addressed directly to the container's ports were dropped
// by the filter-FORWARD rules.
//
// Since 28.0.2, direct routed packets sent to a container's address are all
// dropped in a raw-PREROUTING rule - it doesn't need to be per-port (so, fewer
// rules), and it can be created along with the endpoint (so directly-routed
// packets are dropped at the same point whether or not the endpoint is currently
// the gateway - so, very slightly earlier when it's not the gateway).
//
// This function was a no-op if the gw_mode was "nat-unprotected" or "routed".
// It still is, but now always deletes the rule if it might have been created
// by an older version of the daemon.
//
// TODO(robmry) - remove this once there's no upgrade path from 28.0.x or 28.1.x.
func (n *network) dropLegacyFilterDirectAccess(ctx context.Context, b types.PortBinding) error {
	if rawRulesDisabled(ctx) {
		return nil
	}
	ipv, conf := iptables.IPv4, n.config.Config4
	if b.IP.To4() == nil {
		ipv, conf = iptables.IPv6, n.config.Config6
	}
	// gw_mode=nat-unprotected means there's minimal security for NATed ports,
	// so don't filter direct access.
	if conf.Unprotected || conf.Routed {
		return nil
	}

	legacyRule := iptables.Rule{IPVer: ipv, Table: iptables.Raw, Chain: "PREROUTING", Args: []string{
		"-p", b.Proto.String(),
		"-d", b.IP.String(), // Container IP address
		"--dport", strconv.Itoa(int(b.Port)), // Container port
		"!", "-i", n.config.IfName,
		"-j", "DROP",
	}}
	// enable=false: this daemon version never creates the rule, only deletes it.
	return appendOrDelChainRule(legacyRule, "LEGACY DIRECT ACCESS FILTERING - DROP", false)
}
// rawRulesDisabled reports whether the user has opted out of raw-table rules
// via the DOCKER_INSECURE_NO_IPTABLES_RAW environment variable.
func rawRulesDisabled(ctx context.Context) bool {
	if os.Getenv("DOCKER_INSECURE_NO_IPTABLES_RAW") != "1" {
		return false
	}
	log.G(ctx).Debug("DOCKER_INSECURE_NO_IPTABLES_RAW=1 - skipping raw rules")
	return true
}
//go:build linux
package iptabler
import (
"github.com/docker/docker/daemon/libnetwork/iptables"
)
// mirroredWSL2Workaround adds or removes an IPv4 NAT rule, depending on whether
// docker's host Linux appears to be a guest running under WSL2 with mirrored
// mode networking.
// https://learn.microsoft.com/en-us/windows/wsl/networking#mirrored-mode-networking
//
// Normally (no mirrored mode, or a packet originating on Linux), packets to
// 127.0.0.1 traverse nat-OUTPUT, which doesn't jump to nat-DOCKER because that
// rule excludes "-d 127.0.0.0/8". The packet is delivered to 127.0.0.1 on lo,
// where docker-proxy acts as a man-in-the-middle, re-sending traffic to the
// container from the bridge's own address - so the container can reply.
//
// Under WSL2 mirrored networking, Linux has a loopback0 device in addition to
// lo. Packets sent from Windows to Linux's 127.0.0.1 arrive via loopback0 and
// are treated as external traffic: they hit nat-PREROUTING, whose jump to
// nat-DOCKER has no 127.0.0.0/8 exception (such packets are normally
// impossible). A per-bridge DNAT rule would then deliver them straight to the
// container (bypassing docker-proxy) with source 127.0.0.1, which the
// container cannot answer.
//
// Like the RETURN rules that skip DNAT for packets arriving from other bridge
// networks, this function programs a RETURN rule for packets delivered via
// loopback0 with destination 127.0.0.0/8.
func mirroredWSL2Workaround(ipv iptables.IPVersion, enable bool) error {
	// WSL2 does not (currently) support Windows<->Linux communication via ::1.
	if ipv != iptables.IPv4 {
		return nil
	}
	rule := iptables.Rule{
		IPVer: iptables.IPv4,
		Table: iptables.Nat,
		Chain: dockerChain,
		Args:  []string{"-i", "loopback0", "-d", "127.0.0.0/8", "-j", "RETURN"},
	}
	return programChainRule(rule, "WSL2 loopback", enable)
}
//go:build linux
package nftabler
import (
"context"
"os/exec"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge/internal/firewaller"
"github.com/docker/docker/daemon/libnetwork/internal/nftables"
)
// Cleanup deletes all rules created by nftabler; it's intended to be used
// during startup, to clean up rules created by an old incarnation of the daemon
// after switching to a different Firewaller implementation.
func Cleanup(ctx context.Context, config firewaller.Config) {
	// Delete the whole docker table for one address family; log the outcome.
	deleteTable := func(family nftables.Family, label string) {
		if err := exec.Command("nft", "delete", "table", string(family), dockerTable).Run(); err != nil {
			log.G(ctx).WithError(err).Info("Deleting nftables " + label + " rules")
		} else {
			log.G(ctx).Info("Deleted nftables " + label + " rules")
		}
	}
	if config.IPv4 {
		deleteTable(nftables.IPv4, "IPv4")
	}
	if config.IPv6 {
		deleteTable(nftables.IPv6, "IPv6")
	}
}
// SetFirewallCleaner registers fc, used to remove rules left behind by a
// previous incarnation of the daemon (for example, after a firewall backend
// switch) before new rules are created.
func (nft *nftabler) SetFirewallCleaner(fc firewaller.FirewallCleaner) {
	nft.cleaner = fc
}
//go:build linux
package nftabler
import (
"context"
"fmt"
"net/netip"
"strings"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge/internal/firewaller"
"github.com/docker/docker/daemon/libnetwork/internal/nftables"
)
// AddEndpoint sets up the per-endpoint nftables rules for the given container
// addresses. Any stale rules left by a previous daemon incarnation are removed
// first via the registered cleaner.
func (n *network) AddEndpoint(ctx context.Context, epIPv4, epIPv6 netip.Addr) error {
	if cleaner := n.fw.cleaner; cleaner != nil {
		cleaner.DelEndpoint(ctx, n.config, epIPv4, epIPv6)
	}
	return n.modEndpoint(ctx, epIPv4, epIPv6, true)
}
// DelEndpoint removes the per-endpoint nftables rules created by AddEndpoint.
func (n *network) DelEndpoint(ctx context.Context, epIPv4, epIPv6 netip.Addr) error {
	return n.modEndpoint(ctx, epIPv4, epIPv6, false)
}
// modEndpoint adds (enable=true) or removes (enable=false) the per-endpoint
// rules for each address family that is enabled and has a valid address.
func (n *network) modEndpoint(ctx context.Context, epIPv4, epIPv6 netip.Addr, enable bool) error {
	// Program and apply rules for a single address family.
	modFam := func(table nftables.TableRef, conf firewaller.NetworkConfigFam, epIP netip.Addr) error {
		if err := n.filterDirectAccess(ctx, table, conf, epIP, enable); err != nil {
			return err
		}
		if err := nftApply(ctx, table); err != nil {
			return fmt.Errorf("adding rules for bridge %s: %w", n.config.IfName, err)
		}
		return nil
	}
	if n.fw.config.IPv4 && epIPv4.IsValid() {
		if err := modFam(n.fw.table4, n.config.Config4, epIPv4); err != nil {
			return err
		}
	}
	if n.fw.config.IPv6 && epIPv6.IsValid() {
		if err := modFam(n.fw.table6, n.config.Config6, epIPv6); err != nil {
			return err
		}
	}
	return nil
}
// filterDirectAccess drops packets addressed directly to the container's IP address,
// when direct routing is not permitted by network configuration.
//
// It is a no-op if:
//   - the network is internal.
//   - gateway mode is "nat-unprotected" or "routed".
//   - direct routing is enabled at the daemon level.
//   - "raw" rules are disabled (possibly because the host doesn't have the necessary
//     kernel support).
//
// Packets originating on the bridge's own interface and addressed directly to the
// container are allowed - the host always has direct access to its own containers
// (it doesn't need to use the port mapped to its own addresses, although it can).
//
// "Trusted interfaces" are treated in the same way as the bridge itself.
func (n *network) filterDirectAccess(ctx context.Context, table nftables.TableRef, conf firewaller.NetworkConfigFam, epIP netip.Addr, enable bool) error {
	if n.config.Internal || conf.Unprotected || conf.Routed || n.fw.config.AllowDirectRouting {
		return nil
	}
	updater := table.ChainUpdateFunc(ctx, rawPreroutingChain, enable)
	// Build the complete set of interfaces allowed direct access (the bridge
	// itself plus any trusted host interfaces) and join it once. Splicing a
	// pre-joined TrustedHostInterfaces string into "{ %s, %s }" would emit a
	// dangling comma ("{ br-x,  }") when there are no trusted interfaces,
	// producing a malformed set expression.
	ifNames := make([]string, 0, len(n.config.TrustedHostInterfaces)+1)
	ifNames = append(ifNames, n.config.IfName)
	ifNames = append(ifNames, n.config.TrustedHostInterfaces...)
	return updater(ctx, rawPreroutingPortsRuleGroup,
		`%s daddr %s iifname != { %s } counter drop comment "DROP DIRECT ACCESS"`,
		table.Family(), epIP, strings.Join(ifNames, ", "))
}
//go:build linux
package nftabler
import (
"context"
"errors"
"fmt"
"net/netip"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/types"
)
// AddLink installs the legacy "--link" accept rules between a parent and a
// child container for each exposed port, then applies the IPv4 table.
// Both addresses must be valid and non-zero.
func (n *network) AddLink(ctx context.Context, parentIP, childIP netip.Addr, ports []types.TransportPort) error {
	if !parentIP.IsValid() || parentIP.IsUnspecified() {
		return errors.New("cannot link to a container with an empty parent IP address")
	}
	if !childIP.IsValid() || childIP.IsUnspecified() {
		return errors.New("cannot link to a container with an empty child IP address")
	}
	fwdIn := n.fw.table4.Chain(ctx, chainFilterFwdIn(n.config.IfName))
	for _, port := range ports {
		rules := legacyLinkRules(parentIP, childIP, port)
		for _, rule := range rules {
			if err := fwdIn.AppendRule(ctx, fwdInLegacyLinksRuleGroup, rule); err != nil {
				return err
			}
		}
	}
	if err := nftApply(ctx, n.fw.table4); err != nil {
		return fmt.Errorf("adding rules for bridge %s: %w", n.config.IfName, err)
	}
	return nil
}
// DelLink removes the legacy "--link" rules installed by AddLink. Failures to
// delete individual rules are logged rather than returned.
func (n *network) DelLink(ctx context.Context, parentIP, childIP netip.Addr, ports []types.TransportPort) {
	fwdIn := n.fw.table4.Chain(ctx, chainFilterFwdIn(n.config.IfName))
	for _, port := range ports {
		for _, rule := range legacyLinkRules(parentIP, childIP, port) {
			err := fwdIn.DeleteRule(ctx, fwdInLegacyLinksRuleGroup, rule)
			if err == nil {
				continue
			}
			log.G(ctx).WithFields(log.Fields{
				"rule":  rule,
				"error": err,
			}).Warn("Failed to remove link between containers")
		}
	}
	if err := nftApply(ctx, n.fw.table4); err != nil {
		log.G(ctx).WithError(err).Warn("Removing link, failed to update nftables")
	}
}
// legacyLinkRules returns the pair of nftables rules that implement a legacy
// link from parent to child for one exposed port.
func legacyLinkRules(parentIP, childIP netip.Addr, port types.TransportPort) []string {
	parent, child := parentIP.Unmap(), childIP.Unmap()
	// TODO(robmry) - could combine rules for each proto by using an anonymous set.
	return []string{
		// Match the iptables implementation, but without checking iifname/oifname (not needed
		// because the addresses belong to the bridge).
		fmt.Sprintf("ip saddr %s ip daddr %s %s dport %d counter accept", parent, child, port.Proto, port.Port),
		// Conntrack will allow responses. So, this must be to allow unsolicited packets from an exposed port.
		fmt.Sprintf("ip daddr %s ip saddr %s %s sport %d counter accept", parent, child, port.Proto, port.Port),
	}
}
//go:build linux
package nftabler
import (
"context"
"fmt"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge/internal/firewaller"
"github.com/docker/docker/daemon/libnetwork/internal/nftables"
"github.com/docker/docker/internal/cleanups"
"go.opentelemetry.io/otel"
)
// network holds the nftables state for a single bridge network.
type network struct {
	config firewaller.NetworkConfig // Bridge/network configuration, fixed at creation.
	// cleaner undoes the network-level rules created by NewNetwork; nil after
	// DelNetworkLevelRules has run, or if nothing was set up.
	cleaner func(ctx context.Context) error
	fw      *nftabler // Parent firewaller; owns the per-family tables.
}
// NewNetwork creates the network-level nftables rules for a new bridge
// network described by nc, for each enabled address family. On error, any
// partially-created rules are removed before returning.
func (nft *nftabler) NewNetwork(ctx context.Context, nc firewaller.NetworkConfig) (_ firewaller.Network, retErr error) {
	n := &network{
		fw:     nft,
		config: nc,
	}
	ctx = log.WithLogger(ctx, log.G(ctx).WithFields(log.Fields{"bridge": n.config.IfName}))

	// Collect undo actions; they run in the defer unless Release() is reached
	// at the end of the happy path.
	var undo cleanups.Composite
	defer func() {
		if err := undo.Call(ctx); err != nil {
			log.G(ctx).WithError(err).Warn("Failed to clean up nftables rules for network")
		}
	}()

	// Remove rules a previous daemon incarnation may have left for this network.
	if nft.cleaner != nil {
		nft.cleaner.DelNetwork(ctx, nc)
	}

	if nft.config.IPv4 {
		clean, err := n.configure(ctx, nft.table4, nc.Config4)
		if err != nil {
			return nil, err
		}
		if clean != nil {
			undo.Add(clean)
		}
	}
	if nft.config.IPv6 {
		clean, err := n.configure(ctx, nft.table6, nc.Config6)
		if err != nil {
			return nil, err
		}
		if clean != nil {
			undo.Add(clean)
		}
	}
	n.cleaner = undo.Release()
	return n, nil
}
// configure creates this network's chains, verdict-map jumps and rules in one
// address family's table, then applies the table to the kernel.
//
// It returns a cleanup function that undoes everything created here, or
// (nil, nil) when conf has no valid prefix, i.e. the family isn't configured
// for this network. On error, everything created so far is undone via the
// deferred cleanup before returning.
func (n *network) configure(ctx context.Context, table nftables.TableRef, conf firewaller.NetworkConfigFam) (func(context.Context) error, error) {
	ctx, span := otel.Tracer("").Start(ctx, spanPrefix+".newNetwork."+string(table.Family()))
	defer span.End()
	if !conf.Prefix.IsValid() {
		// Address family not enabled on this network; nothing to set up.
		return nil, nil
	}
	// Undo actions accumulate here; the deferred Call runs them unless
	// cleanup.Release() is reached on the happy path at the end.
	var cleanup cleanups.Composite
	defer cleanup.Call(ctx)
	// If rules were already committed to the kernel, cleanup must re-apply
	// the table after deleting them, so the kernel state matches.
	var applied bool
	cleanup.Add(func(ctx context.Context) error {
		if applied {
			return nftApply(ctx, table)
		}
		return nil
	})
	// Filter chain - per-network ingress/egress chains, plus verdict-map
	// entries that jump to them from the base filter-FORWARD chain.
	fwdInChain := table.Chain(ctx, chainFilterFwdIn(n.config.IfName))
	cleanup.Add(func(ctx context.Context) error { return table.DeleteChain(ctx, chainFilterFwdIn(n.config.IfName)) })
	fwdOutChain := table.Chain(ctx, chainFilterFwdOut(n.config.IfName))
	cleanup.Add(func(ctx context.Context) error { return table.DeleteChain(ctx, chainFilterFwdOut(n.config.IfName)) })
	cf, err := table.InterfaceVMap(ctx, filtFwdInVMap).AddElementCf(ctx, n.config.IfName, "jump "+chainFilterFwdIn(n.config.IfName))
	if err != nil {
		return nil, fmt.Errorf("adding filter-forward jump for %s to %q: %w", conf.Prefix, chainFilterFwdIn(n.config.IfName), err)
	}
	cleanup.Add(cf)
	cf, err = table.InterfaceVMap(ctx, filtFwdOutVMap).AddElementCf(ctx, n.config.IfName, "jump "+chainFilterFwdOut(n.config.IfName))
	if err != nil {
		return nil, fmt.Errorf("adding filter-forward jump for %s to %q: %w", conf.Prefix, chainFilterFwdOut(n.config.IfName), err)
	}
	cleanup.Add(cf)
	// NAT chain - per-network postrouting chains and their verdict-map jumps.
	natPostroutingIn := table.Chain(ctx, chainNatPostRtIn(n.config.IfName))
	cleanup.Add(func(ctx context.Context) error { return table.DeleteChain(ctx, chainNatPostRtIn(n.config.IfName)) })
	cf, err = table.InterfaceVMap(ctx, natPostroutingInVMap).AddElementCf(ctx, n.config.IfName, "jump "+chainNatPostRtIn(n.config.IfName))
	if err != nil {
		return nil, fmt.Errorf("adding postrouting ingress jump for %s to %q: %w", conf.Prefix, chainNatPostRtIn(n.config.IfName), err)
	}
	cleanup.Add(cf)
	natPostroutingOut := table.Chain(ctx, chainNatPostRtOut(n.config.IfName))
	cleanup.Add(func(ctx context.Context) error { return table.DeleteChain(ctx, chainNatPostRtOut(n.config.IfName)) })
	cf, err = table.InterfaceVMap(ctx, natPostroutingOutVMap).AddElementCf(ctx, n.config.IfName, "jump "+chainNatPostRtOut(n.config.IfName))
	if err != nil {
		return nil, fmt.Errorf("adding postrouting egress jump for %s to %q: %w", conf.Prefix, chainNatPostRtOut(n.config.IfName), err)
	}
	cleanup.Add(cf)
	// Conntrack - accept established/related traffic early in both directions.
	cf, err = fwdInChain.AppendRuleCf(ctx, initialRuleGroup, "ct state established,related counter accept")
	if err != nil {
		return nil, fmt.Errorf("adding conntrack ingress rule for %q: %w", n.config.IfName, err)
	}
	cleanup.Add(cf)
	cf, err = fwdOutChain.AppendRuleCf(ctx, initialRuleGroup, "ct state established,related counter accept")
	if err != nil {
		return nil, fmt.Errorf("adding conntrack egress rule for %q: %w", n.config.IfName, err)
	}
	cleanup.Add(cf)
	// Inter-Container Communication verdict, per network configuration.
	iccVerdict := "accept"
	if !n.config.ICC {
		iccVerdict = "drop"
	}
	if n.config.Internal {
		// Drop anything that's not from this network.
		cf, err = fwdInChain.AppendRuleCf(ctx, initialRuleGroup,
			`iifname != %s counter drop comment "INTERNAL NETWORK INGRESS"`, n.config.IfName)
		if err != nil {
			return nil, fmt.Errorf("adding INTERNAL NETWORK ingress rule for %q: %w", n.config.IfName, err)
		}
		cleanup.Add(cf)
		cf, err = fwdOutChain.AppendRuleCf(ctx, initialRuleGroup,
			`oifname != %s counter drop comment "INTERNAL NETWORK EGRESS"`, n.config.IfName)
		if err != nil {
			return nil, fmt.Errorf("adding INTERNAL NETWORK egress rule for %q: %w", n.config.IfName, err)
		}
		cleanup.Add(cf)
		// Accept or drop Inter-Container Communication.
		cf, err = fwdInChain.AppendRuleCf(ctx, fwdInICCRuleGroup, "counter %s comment ICC", iccVerdict)
		if err != nil {
			return nil, fmt.Errorf("adding ICC ingress rule for %q: %w", n.config.IfName, err)
		}
		cleanup.Add(cf)
	} else {
		// Inter-Container Communication
		cf, err = fwdInChain.AppendRuleCf(ctx, fwdInICCRuleGroup, "iifname == %s counter %s comment ICC",
			n.config.IfName, iccVerdict)
		if err != nil {
			return nil, fmt.Errorf("adding ICC rule for %q: %w", n.config.IfName, err)
		}
		cleanup.Add(cf)
		// Outgoing traffic
		cf, err = fwdOutChain.AppendRuleCf(ctx, initialRuleGroup, "counter accept comment OUTGOING")
		if err != nil {
			return nil, fmt.Errorf("adding OUTGOING rule for %q: %w", n.config.IfName, err)
		}
		cleanup.Add(cf)
		// Incoming traffic - final verdict for ports no earlier rule accepted.
		if conf.Unprotected {
			cf, err = fwdInChain.AppendRuleCf(ctx, fwdInFinalRuleGroup, `counter accept comment "UNPROTECTED"`)
			if err != nil {
				return nil, fmt.Errorf("adding UNPROTECTED for %q: %w", n.config.IfName, err)
			}
			cleanup.Add(cf)
		} else {
			cf, err = fwdInChain.AppendRuleCf(ctx, fwdInFinalRuleGroup, `counter drop comment "UNPUBLISHED PORT DROP"`)
			if err != nil {
				return nil, fmt.Errorf("adding UNPUBLISHED PORT DROP for %q: %w", n.config.IfName, err)
			}
			cleanup.Add(cf)
		}
		// ICMP - accepted unconditionally for routed networks.
		if conf.Routed {
			rule := "ip protocol icmp"
			if table.Family() == nftables.IPv6 {
				rule = "meta l4proto ipv6-icmp"
			}
			cf, err = fwdInChain.AppendRuleCf(ctx, initialRuleGroup, rule+" counter accept comment ICMP")
			if err != nil {
				return nil, fmt.Errorf("adding ICMP rule for %q: %w", n.config.IfName, err)
			}
			cleanup.Add(cf)
		}
		// Masquerade / SNAT - masquerade picks a source IP address based on next-hop, SNAT uses conf.HostIP.
		natPostroutingVerdict := "masquerade"
		natPostroutingComment := "MASQUERADE"
		if conf.HostIP.IsValid() {
			natPostroutingVerdict = "snat to " + conf.HostIP.Unmap().String()
			natPostroutingComment = "SNAT"
		}
		if n.config.Masquerade && !conf.Routed {
			cf, err = natPostroutingOut.AppendRuleCf(ctx, initialRuleGroup, `oifname != %s %s saddr %s counter %s comment "%s"`,
				n.config.IfName, table.Family(), conf.Prefix, natPostroutingVerdict, natPostroutingComment)
			if err != nil {
				return nil, fmt.Errorf("adding NAT rule for %q: %w", n.config.IfName, err)
			}
			cleanup.Add(cf)
		}
		if n.fw.config.Hairpin {
			// Masquerade/SNAT traffic from localhost.
			cf, err = natPostroutingIn.AppendRuleCf(ctx, initialRuleGroup, `fib saddr type local counter %s comment "%s FROM HOST"`,
				natPostroutingVerdict, natPostroutingComment)
			if err != nil {
				return nil, fmt.Errorf("adding NAT local rule for %q: %w", n.config.IfName, err)
			}
			cleanup.Add(cf)
		}
	}
	ctx = log.WithLogger(ctx, log.G(ctx).WithFields(log.Fields{
		"bridge": n.config.IfName,
		"family": table.Family(),
	}))
	// Commit everything to the kernel in a single transaction.
	if err := nftApply(ctx, table); err != nil {
		return nil, fmt.Errorf("adding rules for bridge %s: %w", n.config.IfName, err)
	}
	applied = true
	return cleanup.Release(), nil
}
// ReapplyNetworkLevelRules is a no-op for the nftables firewaller.
func (n *network) ReapplyNetworkLevelRules(ctx context.Context) error {
	// A firewalld reload doesn't delete nftables rules, this function is not needed.
	log.G(ctx).Warn("ReapplyNetworkLevelRules is not implemented for nftables")
	return nil
}
// DelNetworkLevelRules removes the rules created for this network by
// NewNetwork. It's idempotent: a cleanup failure is logged, and the cleaner is
// discarded either way so a second call does nothing.
func (n *network) DelNetworkLevelRules(ctx context.Context) error {
	if n.cleaner == nil {
		return nil
	}
	ctx = log.WithLogger(ctx, log.G(ctx).WithFields(log.Fields{"bridge": n.config.IfName}))
	if err := n.cleaner(ctx); err != nil {
		log.G(ctx).WithError(err).Warn("Failed to remove network rules for network")
	}
	n.cleaner = nil
	return nil
}
// chainFilterFwdIn returns the name of the per-network filter-forward ingress
// chain for bridge interface ifName.
func chainFilterFwdIn(ifName string) string {
	return fmt.Sprintf("filter-forward-in__%s", ifName)
}
// chainFilterFwdOut returns the name of the per-network filter-forward egress
// chain for bridge interface ifName.
func chainFilterFwdOut(ifName string) string {
	return fmt.Sprintf("filter-forward-out__%s", ifName)
}
// chainNatPostRtOut returns the name of the per-network NAT postrouting
// egress chain for bridge interface ifName.
func chainNatPostRtOut(ifName string) string {
	return fmt.Sprintf("nat-postrouting-out__%s", ifName)
}
// chainNatPostRtIn returns the name of the per-network NAT postrouting
// ingress chain for bridge interface ifName.
func chainNatPostRtIn(ifName string) string {
	return fmt.Sprintf("nat-postrouting-in__%s", ifName)
}
//go:build linux
package nftabler
import (
"context"
"fmt"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge/internal/firewaller"
"github.com/docker/docker/daemon/libnetwork/internal/nftables"
"go.opentelemetry.io/otel"
)
// Prefix for OTEL span names.
const spanPrefix = "libnetwork.drivers.bridge.nftabler"

// Names of the table, base chains, regular chains and verdict maps created by
// this package (see init for how they fit together).
const (
	dockerTable           = "docker-bridges"             // The driver's own nftables table.
	forwardChain          = "filter-FORWARD"             // Base chain, filter/forward hook.
	postroutingChain      = "nat-POSTROUTING"            // Base chain, nat/postrouting hook.
	preroutingChain       = "nat-PREROUTING"             // Base chain, nat/prerouting hook.
	outputChain           = "nat-OUTPUT"                 // Base chain, nat/output hook.
	natChain              = "nat-prerouting-and-output"  // Shared chain jumped to from both NAT base chains.
	rawPreroutingChain    = "raw-PREROUTING"             // Base chain, raw-priority prerouting hook.
	filtFwdInVMap         = "filter-forward-in-jumps"    // oifname -> per-network ingress chain.
	filtFwdOutVMap        = "filter-forward-out-jumps"   // iifname -> per-network egress chain.
	natPostroutingOutVMap = "nat-postrouting-out-jumps"  // iifname -> per-network NAT egress chain.
	natPostroutingInVMap  = "nat-postrouting-in-jumps"   // oifname -> per-network NAT ingress chain.
)

// Rule groups order rules within a chain; lower group numbers come first.
const (
	initialRuleGroup nftables.RuleGroup = iota
)

// Rule groups for the per-network filter-forward ingress chain: legacy
// "--link" rules first, then ICC, then published ports, then the final
// accept/drop verdict.
const (
	fwdInLegacyLinksRuleGroup = iota + initialRuleGroup + 1
	fwdInICCRuleGroup
	fwdInPortsRuleGroup
	fwdInFinalRuleGroup
)

// Rule groups for the raw-PREROUTING chain.
const (
	rawPreroutingPortsRuleGroup = iota + initialRuleGroup + 1
)

// nftabler implements the bridge driver's firewaller on top of nftables.
type nftabler struct {
	config  firewaller.Config          // Daemon-level firewall configuration.
	cleaner firewaller.FirewallCleaner // Removes rules from a previous daemon incarnation; may be nil.
	table4  nftables.TableRef          // IPv4 table; only valid if config.IPv4.
	table6  nftables.TableRef          // IPv6 table; only valid if config.IPv6.
}
// NewNftabler creates the nftables-based firewaller, setting up the base
// tables and chains for each enabled address family. An IPv6 apply failure is
// logged but not fatal, so the daemon can still start with IPv4 only.
func NewNftabler(ctx context.Context, config firewaller.Config) (firewaller.Firewaller, error) {
	nft := &nftabler{config: config}

	if nft.config.IPv4 {
		t4, err := nft.init(ctx, nftables.IPv4)
		if err != nil {
			return nil, err
		}
		nft.table4 = t4
		if err := nftApply(ctx, nft.table4); err != nil {
			return nil, fmt.Errorf("IPv4 initialisation: %w", err)
		}
	}

	if nft.config.IPv6 {
		t6, err := nft.init(ctx, nftables.IPv6)
		if err != nil {
			return nil, err
		}
		nft.table6 = t6
		if err := nftApply(ctx, nft.table6); err != nil {
			// Perhaps the kernel has no IPv6 support. It won't be possible to create IPv6
			// networks without enabling ip6_tables in the kernel, or disabling ip6tables in
			// the daemon config. But, allow the daemon to start because IPv4 will work. So,
			// log the problem, and continue.
			log.G(ctx).WithError(err).Warn("ip6tables is enabled, but cannot set up IPv6 nftables table")
		}
	}
	return nft, nil
}
// getTable returns the nftables table for the given address family.
func (nft *nftabler) getTable(ipv firewaller.IPVersion) nftables.TableRef {
	switch ipv {
	case firewaller.IPv4:
		return nft.table4
	default:
		return nft.table6
	}
}
// FilterForwardDrop sets the policy of the filter-FORWARD base chain to
// "drop" for the given address family, then applies the change.
func (nft *nftabler) FilterForwardDrop(ctx context.Context, ipv firewaller.IPVersion) error {
	table := nft.getTable(ipv)
	err := table.Chain(ctx, forwardChain).SetPolicy("drop")
	if err != nil {
		return err
	}
	return nftApply(ctx, table)
}
// init creates the bridge driver's nftables table for IPv4 or IPv6.
//
// It sets up the base chains (filter-FORWARD, nat-POSTROUTING, nat-PREROUTING,
// nat-OUTPUT, raw-PREROUTING), the verdict maps that dispatch to per-network
// chains, and the shared NAT chain. The table is built but not applied here;
// the caller commits it with nftApply.
func (nft *nftabler) init(ctx context.Context, family nftables.Family) (nftables.TableRef, error) {
	// Instantiate the table.
	table, err := nftables.NewTable(family, dockerTable)
	if err != nil {
		return table, err
	}
	// Set up the filter forward chain.
	//
	// This base chain only contains two rules that use verdict maps:
	// - if a packet is entering a bridge network, jump to that network's filter-forward ingress chain.
	// - if a packet is leaving a bridge network, jump to that network's filter-forward egress chain.
	//
	// So, packets that aren't related to docker don't need to traverse any per-network filter forward
	// rules - and packets that are entering or leaving docker networks only need to traverse rules
	// related to those networks.
	fwdChain, err := table.BaseChain(ctx, forwardChain,
		nftables.BaseChainTypeFilter,
		nftables.BaseChainHookForward,
		nftables.BaseChainPriorityFilter)
	if err != nil {
		return nftables.TableRef{}, fmt.Errorf("initialising nftables: %w", err)
	}
	// Instantiate the verdict maps and add the jumps.
	_ = table.InterfaceVMap(ctx, filtFwdInVMap)
	if err := fwdChain.AppendRule(ctx, initialRuleGroup, "oifname vmap @"+filtFwdInVMap); err != nil {
		return nftables.TableRef{}, fmt.Errorf("initialising nftables: %w", err)
	}
	_ = table.InterfaceVMap(ctx, filtFwdOutVMap)
	if err := fwdChain.AppendRule(ctx, initialRuleGroup, "iifname vmap @"+filtFwdOutVMap); err != nil {
		return nftables.TableRef{}, fmt.Errorf("initialising nftables: %w", err)
	}
	// Set up the NAT postrouting base chain.
	//
	// Like the filter-forward chain, its only rules are jumps to network-specific ingress and egress chains.
	natPostRtChain, err := table.BaseChain(ctx, postroutingChain,
		nftables.BaseChainTypeNAT,
		nftables.BaseChainHookPostrouting,
		nftables.BaseChainPrioritySrcNAT)
	if err != nil {
		return nftables.TableRef{}, err
	}
	_ = table.InterfaceVMap(ctx, natPostroutingOutVMap)
	if err := natPostRtChain.AppendRule(ctx, initialRuleGroup, "iifname vmap @"+natPostroutingOutVMap); err != nil {
		return nftables.TableRef{}, fmt.Errorf("initialising nftables: %w", err)
	}
	_ = table.InterfaceVMap(ctx, natPostroutingInVMap)
	if err := natPostRtChain.AppendRule(ctx, initialRuleGroup, "oifname vmap @"+natPostroutingInVMap); err != nil {
		return nftables.TableRef{}, fmt.Errorf("initialising nftables: %w", err)
	}
	// Instantiate natChain, for the NAT prerouting and output base chains to jump to.
	_ = table.Chain(ctx, natChain)
	// Set up the NAT prerouting base chain - only locally-addressed packets
	// are sent to the shared NAT chain.
	natPreRtChain, err := table.BaseChain(ctx, preroutingChain,
		nftables.BaseChainTypeNAT,
		nftables.BaseChainHookPrerouting,
		nftables.BaseChainPriorityDstNAT)
	if err != nil {
		return nftables.TableRef{}, err
	}
	if err := natPreRtChain.AppendRule(ctx, initialRuleGroup, "fib daddr type local counter jump "+natChain); err != nil {
		return nftables.TableRef{}, fmt.Errorf("initialising nftables: %w", err)
	}
	// Set up the NAT output base chain
	natOutputChain, err := table.BaseChain(ctx, outputChain,
		nftables.BaseChainTypeNAT,
		nftables.BaseChainHookOutput,
		nftables.BaseChainPriorityDstNAT)
	if err != nil {
		return nftables.TableRef{}, err
	}
	// For output, don't jump to the NAT chain if hairpin is enabled (no userland proxy).
	var skipLoopback string
	if !nft.config.Hairpin {
		if family == nftables.IPv4 {
			// NOTE(review): "127.0.0.1/8" has host bits set; the iptabler
			// equivalent uses "127.0.0.0/8". Confirm nft accepts/masks this
			// prefix as intended.
			skipLoopback = "ip daddr != 127.0.0.1/8 "
		} else {
			skipLoopback = "ip6 daddr != ::1 "
		}
	}
	if err := natOutputChain.AppendRule(ctx, initialRuleGroup, skipLoopback+"fib daddr type local counter jump "+natChain); err != nil {
		return nftables.TableRef{}, fmt.Errorf("initialising nftables: %w", err)
	}
	// Set up the raw prerouting base chain
	if _, err := table.BaseChain(ctx, rawPreroutingChain,
		nftables.BaseChainTypeFilter,
		nftables.BaseChainHookPrerouting,
		nftables.BaseChainPriorityRaw); err != nil {
		return nftables.TableRef{}, err
	}
	// With the userland proxy and WSL2 mirrored networking, packets from the
	// Windows host need special handling.
	if !nft.config.Hairpin && nft.config.WSL2Mirrored {
		if err := mirroredWSL2Workaround(ctx, table); err != nil {
			return nftables.TableRef{}, err
		}
	}
	return table, nil
}
// nftApply commits any pending changes in table to the kernel, recording the
// operation in an OTel span named after the table's address family.
func nftApply(ctx context.Context, table nftables.TableRef) error {
	spanCtx, span := otel.Tracer("").Start(ctx, spanPrefix+".nftApply."+string(table.Family()))
	defer span.End()
	err := table.Apply(spanCtx)
	if err != nil {
		return fmt.Errorf("applying nftables rules: %w", err)
	}
	return nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.22 && linux
package nftabler
import (
"context"
"errors"
"fmt"
"net"
"strconv"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge/internal/firewaller"
"github.com/docker/docker/daemon/libnetwork/internal/nftables"
"github.com/docker/docker/daemon/libnetwork/types"
)
// pbContext carries the per-address-family state needed when adding or
// removing port-binding rules: the nftables table for the family, the
// network's firewaller config for that family, and the family itself.
type pbContext struct {
	table nftables.TableRef
	conf  firewaller.NetworkConfigFam
	ipv   firewaller.IPVersion
}
// AddPorts sets up the nftables rules for the given port bindings on this network.
func (n *network) AddPorts(ctx context.Context, pbs []types.PortBinding) error {
	return n.modPorts(ctx, pbs, true)
}
// DelPorts removes the nftables rules for the given port bindings on this network.
func (n *network) DelPorts(ctx context.Context, pbs []types.PortBinding) error {
	return n.modPorts(ctx, pbs, false)
}
// modPorts adds (enable=true) or removes (enable=false) per-port rules for
// pbs, processing the IPv4 and IPv6 tables independently. Internal networks
// publish no ports, so nothing is done for them.
func (n *network) modPorts(ctx context.Context, pbs []types.PortBinding, enable bool) error {
	if n.config.Internal {
		return nil
	}
	ctx = log.WithLogger(ctx, log.G(ctx).WithFields(log.Fields{"bridge": n.config.IfName}))
	if enable && n.fw.cleaner != nil {
		// A cleaner is set; let it delete rules for these ports first
		// (presumably rules left by a previous daemon — confirm in fw.cleaner).
		n.fw.cleaner.DelPorts(ctx, n.config, pbs)
	}
	pbs4, pbs6 := splitByContainerFam(pbs)
	type famPorts struct {
		enabled bool
		pbs     []types.PortBinding
		pbc     pbContext
	}
	for _, f := range []famPorts{
		{
			enabled: n.fw.config.IPv4 && n.config.Config4.Prefix.IsValid(),
			pbs:     pbs4,
			pbc:     pbContext{table: n.fw.table4, conf: n.config.Config4, ipv: firewaller.IPv4},
		},
		{
			enabled: n.fw.config.IPv6 && n.config.Config6.Prefix.IsValid(),
			pbs:     pbs6,
			pbc:     pbContext{table: n.fw.table6, conf: n.config.Config6, ipv: firewaller.IPv6},
		},
	} {
		if !f.enabled {
			continue
		}
		if err := n.setPerPortRules(ctx, f.pbs, f.pbc, enable); err != nil {
			return err
		}
	}
	return nil
}
// splitByContainerFam partitions port bindings by the address family of the
// container address, returning the IPv4 bindings first and IPv6 second.
func splitByContainerFam(pbs []types.PortBinding) ([]types.PortBinding, []types.PortBinding) {
	var v4, v6 []types.PortBinding
	for _, pb := range pbs {
		if pb.IP.To4() == nil {
			v6 = append(v6, pb)
		} else {
			v4 = append(v4, pb)
		}
	}
	return v4, v6
}
// setPerPortRules applies (or removes, per enable) every category of per-port
// rule for pbs in one address family, then commits the table to the kernel.
// Each step updates a different chain.
func (n *network) setPerPortRules(ctx context.Context, pbs []types.PortBinding, pbc pbContext, enable bool) error {
	steps := []func(context.Context, []types.PortBinding, pbContext, bool) error{
		n.setPerPortForwarding,
		n.setPerPortDNAT,
		n.setPerPortHairpinMasq,
		n.filterPortMappedOnLoopback,
	}
	for _, step := range steps {
		if err := step(ctx, pbs, pbc, enable); err != nil {
			return err
		}
	}
	if err := nftApply(ctx, pbc.table); err != nil {
		return fmt.Errorf("adding rules for bridge %s: %w", n.config.IfName, err)
	}
	return nil
}
// setPerPortForwarding accepts forwarded traffic to each published container
// port in the network's filter-forward ingress chain. It's a no-op for
// networks whose config marks them Unprotected. When enable is false, the
// same rules are deleted instead of added.
func (n *network) setPerPortForwarding(ctx context.Context, pbs []types.PortBinding, pbc pbContext, enable bool) error {
	if pbc.conf.Unprotected {
		return nil
	}
	updateFwdIn := pbc.table.ChainUpdateFunc(ctx, chainFilterFwdIn(n.config.IfName), enable)
	for _, pb := range pbs {
		// When more than one host port is mapped to a single container port, this will
		// generate the same rule for each host port. So, ignore duplicates when adding,
		// and missing rules when removing. (No ref-counting is currently needed because
		// when bindings are added or removed for an endpoint, they're all added or
		// removed for an address family. So, a rule that's added more than once will
		// also be deleted more than once.)
		//
		// TODO(robmry) - track port mappings, use that to edit nftables sets when bindings are added/removed.
		rule := fmt.Sprintf("%s daddr %s %s dport %d counter accept", pbc.table.Family(), pb.IP, pb.Proto, pb.Port)
		if err := updateFwdIn(ctx, fwdInPortsRuleGroup, rule); err != nil &&
			!errors.Is(err, nftables.ErrRuleExist) && !errors.Is(err, nftables.ErrRuleNotExist) {
			return fmt.Errorf("updating forwarding rule for port %s %s:%d/%s on %s, enable=%v: %w",
				pbc.table.Family(), pb.IP, pb.Port, pb.Proto, n.config.IfName, enable, err)
		}
	}
	return nil
}
// setPerPortDNAT adds (enable=true) or removes (enable=false) the DNAT rules
// that redirect traffic arriving on a published host port to the container's
// address and port. Bindings with no host port, and bindings between
// different address families (which docker-proxy handles), are skipped.
func (n *network) setPerPortDNAT(ctx context.Context, pbs []types.PortBinding, pbc pbContext, enable bool) error {
	updater := pbc.table.ChainUpdateFunc(ctx, natChain, enable)
	// Without hairpin NAT (userland proxy enabled), don't DNAT packets that
	// arrive from this network's own bridge.
	var proxySkip string
	if !n.fw.config.Hairpin {
		proxySkip = fmt.Sprintf("iifname != %s ", n.config.IfName)
	}
	// Never DNAT packets with an IPv6 link-local source address.
	var v6LLSkip string
	if pbc.ipv == firewaller.IPv6 {
		v6LLSkip = "ip6 saddr != fe80::/10 "
	}
	for _, pb := range pbs {
		// Nothing to do if NAT is disabled.
		if pb.HostPort == 0 {
			continue
		}
		// If the binding is between containerV4 and hostV6, NAT isn't possible (the mapping
		// is handled by docker-proxy).
		if (pb.IP.To4() != nil) != (pb.HostIP.To4() != nil) {
			continue
		}
		// Match the specific host address, unless the binding is to the
		// unspecified address (in which case match any destination).
		var daddrMatch string
		if !pb.HostIP.IsUnspecified() {
			daddrMatch = fmt.Sprintf("%s daddr %s ", pbc.table.Family(), pb.HostIP)
		}
		rule := fmt.Sprintf("%s%s%s%s dport %d counter dnat to %s comment DNAT",
			proxySkip, v6LLSkip, daddrMatch, pb.Proto, pb.HostPort,
			net.JoinHostPort(pb.IP.String(), strconv.Itoa(int(pb.Port))))
		if err := updater(ctx, initialRuleGroup, rule); err != nil {
			return fmt.Errorf("adding DNAT for %s %s:%d -> %s:%d/%s on %s: %w",
				pbc.table.Family(), pb.HostIP, pb.HostPort, pb.IP, pb.Port, pb.Proto, n.config.IfName, err)
		}
	}
	return nil
}
// setPerPortHairpinMasq allows containers to access their own published ports on the host
// when hairpin is enabled (no docker-proxy), by masquerading.
//
// The rules are added to (enable=true) or removed from (enable=false) the
// per-bridge nat-postrouting ingress chain. Bindings with no host port, or
// between different address families, are skipped.
func (n *network) setPerPortHairpinMasq(ctx context.Context, pbs []types.PortBinding, pbc pbContext, enable bool) error {
	if !n.fw.config.Hairpin {
		return nil
	}
	updater := pbc.table.ChainUpdateFunc(ctx, chainNatPostRtIn(n.config.IfName), enable)
	for _, pb := range pbs {
		// Nothing to do if NAT is disabled.
		if pb.HostPort == 0 {
			continue
		}
		// If the binding is between containerV4 and hostV6, NAT isn't possible (it's
		// handled by docker-proxy).
		if (pb.IP.To4() != nil) != (pb.HostIP.To4() != nil) {
			continue
		}
		// When more than one host port is mapped to a single container port, this will
		// generate the same rule for each host port. So, ignore duplicates when adding,
		// and missing rules when removing. (No ref-counting is currently needed because
		// when bindings are added or removed for an endpoint, they're all added or
		// removed. So, a rule that's added more than once will also be deleted more
		// than once.)
		//
		// TODO(robmry) - track port mappings, use that to edit nftables sets when bindings are added/removed.
		rule := fmt.Sprintf(`%s saddr %s %s daddr %s %s dport %d counter masquerade comment "MASQ TO OWN PORT"`,
			pbc.table.Family(), pb.IP, pbc.table.Family(), pb.IP, pb.Proto, pb.Port)
		if err := updater(ctx, initialRuleGroup, rule); err != nil &&
			!errors.Is(err, nftables.ErrRuleExist) && !errors.Is(err, nftables.ErrRuleNotExist) {
			return fmt.Errorf("adding MASQ TO OWN PORT for %d -> %s:%d/%s: %w",
				pb.Port, pb.IP, pb.Port, pb.Proto, err)
		}
	}
	return nil
}
// filterPortMappedOnLoopback adds a rule that drops remote connections to ports
// mapped to loopback addresses.
//
// This is a no-op if the portBinding is for IPv6 (IPv6 loopback address is
// non-routable), or over a network with gw_mode=routed (PBs in routed mode
// don't map ports on the host).
//
// When enable is false, the same rules are removed instead of added.
func (n *network) filterPortMappedOnLoopback(ctx context.Context, pbs []types.PortBinding, pbc pbContext, enable bool) error {
	if pbc.ipv == firewaller.IPv6 {
		return nil
	}
	updater := pbc.table.ChainUpdateFunc(ctx, rawPreroutingChain, enable)
	for _, pb := range pbs {
		// Nothing to do if not binding to the loopback address.
		if pb.HostPort == 0 || !pb.HostIP.IsLoopback() {
			continue
		}
		// Mappings from host IPv6 to container IPv4 are handled by docker-proxy.
		if pb.HostIP.To4() == nil {
			continue
		}
		if n.fw.config.WSL2Mirrored {
			// With WSL2 mirrored networking, traffic from the Windows host
			// arrives on interface loopback0 - accept it before the drop
			// rule below can discard it.
			if err := updater(ctx, rawPreroutingPortsRuleGroup,
				`iifname loopback0 ip daddr %s %s dport %d counter accept comment "%s"`,
				pb.HostIP, pb.Proto, pb.HostPort, "ACCEPT WSL2 LOOPBACK"); err != nil {
				return fmt.Errorf("adding WSL2 loopback rule for %d: %w", pb.HostPort, err)
			}
		}
		if err := updater(ctx, rawPreroutingPortsRuleGroup,
			`iifname != lo ip daddr %s %s dport %d counter drop comment "DROP REMOTE LOOPBACK"`,
			pb.HostIP, pb.Proto, pb.HostPort); err != nil {
			return fmt.Errorf("adding loopback filter rule for %d: %w", pb.HostPort, err)
		}
	}
	return nil
}
//go:build linux
package nftabler
import (
"context"
"github.com/docker/docker/daemon/libnetwork/internal/nftables"
)
// mirroredWSL2Workaround adds IPv4 NAT rule if docker's host Linux appears to
// be a guest running under WSL2 in with mirrored mode networking.
// https://learn.microsoft.com/en-us/windows/wsl/networking#mirrored-mode-networking
//
// Without mirrored mode networking, or for a packet sent from Linux, packets
// sent to 127.0.0.1 are processed as outgoing - they hit the nat-OUTPUT chain,
// which does not jump to the nat-DOCKER chain because the rule has an exception
// for "-d 127.0.0.0/8". The default action on the nat-OUTPUT chain is ACCEPT (by
// default), so the packet is delivered to 127.0.0.1 on lo, where docker-proxy
// picks it up and acts as a man-in-the-middle; it receives the packet and
// re-sends it to the container (or acks a SYN and sets up a second TCP
// connection to the container). So, the container sees packets arrive with a
// source address belonging to the network's bridge, and it is able to reply to
// that address.
//
// In WSL2's mirrored networking mode, Linux has a loopback0 device as well as lo
// (which owns 127.0.0.1 as normal). Packets sent to 127.0.0.1 from Windows to a
// server listening on Linux's 127.0.0.1 are delivered via loopback0, and
// processed as packets arriving from outside the Linux host (which they are).
//
// So, these packets hit the nat-PREROUTING chain instead of nat-OUTPUT. It would
// normally be impossible for a packet ->127.0.0.1 to arrive from outside the
// host, so the nat-PREROUTING jump to nat-DOCKER has no exception for it. The
// packet is processed by a per-bridge DNAT rule in that chain, so it is
// delivered directly to the container (not via docker-proxy) with source address
// 127.0.0.1, so the container can't respond.
//
// DNAT is normally skipped by RETURN rules in the nat-DOCKER chain for packets
// arriving from any other bridge network. Similarly, this function adds (or
// removes) a rule to RETURN early for packets delivered via loopback0 with
// destination 127.0.0.0/8.
func mirroredWSL2Workaround(ctx context.Context, table nftables.TableRef) error {
	if table.Family() == nftables.IPv4 {
		// WSL2 does not (currently) support Windows<->Linux communication via
		// ::1, so only the IPv4 table needs the workaround.
		return table.Chain(ctx, natChain).AppendRule(ctx,
			initialRuleGroup, `iifname "loopback0" ip daddr 127.0.0.0/8 counter return`)
	}
	return nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package bridge
import (
"context"
"errors"
"fmt"
"net"
"slices"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/netutils"
"github.com/docker/docker/daemon/libnetwork/portmapperapi"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/internal/sliceutil"
)
// addPortMappings takes cfg, the configuration for port mappings, selects host
// ports when ranges are given, binds host ports to check they're available and
// reserve them, starts docker-proxy if required, and sets up iptables
// NAT/forwarding rules as necessary. If anything goes wrong, it will undo any
// work it's done and return an error. Otherwise, the returned slice of
// PortBinding has an entry per address family (if cfg describes a mapping for
// 'any' host address, it's expanded into mappings for IPv4 and IPv6, because
// that's how the mapping is presented in 'inspect'). HostPort and HostPortEnd in
// each returned PortBinding are set to the selected and reserved port.
func (n *bridgeNetwork) addPortMappings(
	ctx context.Context,
	ep *bridgeEndpoint,
	cfg []portmapperapi.PortBindingReq,
	defHostIP net.IP,
	pbmReq portBindingMode,
) (_ []portmapperapi.PortBinding, retErr error) {
	if len(defHostIP) == 0 {
		defHostIP = net.IPv4zero
	} else if addr4 := defHostIP.To4(); addr4 != nil {
		// Unmap the address if it's IPv4-mapped IPv6.
		defHostIP = addr4
	}
	pms := n.portMappers()
	bindings := make([]portmapperapi.PortBinding, 0, len(cfg)*2)
	// Undo any bindings made so far if a later step fails.
	defer func() {
		if retErr != nil {
			if err := n.unmapPBs(ctx, bindings); err != nil {
				log.G(ctx).WithFields(log.Fields{
					"bindings": bindings,
					"error":    err,
					"origErr":  retErr,
				}).Warn("Failed to unmap port bindings after error")
			}
		}
	}()
	bindingReqs := n.sortAndNormPBs(ctx, ep, cfg, defHostIP, pbmReq)
	// toBind accumulates port bindings that should be allocated the same host port
	// (if required by NAT config). If the host address is unspecified, and defHostIP
	// is 0.0.0.0, one iteration of the loop may generate bindings for v4 and v6. If
	// a host address is specified, it'll either be IPv4 or IPv6, and only one
	// binding will be added per iteration. Config for bindings that only differ in
	// host IP are sorted next to each other, the loop continues until toBind has
	// collected them all, for both v4 and v6. The addresses may be 0.0.0.0 and [::],
	// or multiple addresses of both address families. Once there are no more
	// bindings to collect, they're applied and toBind is reset.
	var toBind []portmapperapi.PortBindingReq
	for i, c := range bindingReqs {
		toBind = append(toBind, c)
		if i < len(bindingReqs)-1 && c.Mapper == bindingReqs[i+1].Mapper && needSamePort(c, bindingReqs[i+1]) {
			// This port binding matches the next, apart from host IP. So, continue
			// collecting bindings, then allocate the same host port for all addresses.
			continue
		}
		pm, err := pms.Get(c.Mapper)
		if err != nil {
			return nil, err
		}
		newB, err := pm.MapPorts(ctx, toBind, n.firewallerNetwork)
		if err != nil {
			return nil, err
		}
		// Record which mapper created each binding, so the same mapper can be
		// used to unmap it later.
		bindings = append(bindings, sliceutil.Map(newB, func(b portmapperapi.PortBinding) portmapperapi.PortBinding {
			b.Mapper = c.Mapper
			return b
		})...)
		// Reset toBind now the ports are bound.
		toBind = toBind[:0]
	}
	return bindings, nil
}
// sortAndNormPBs transforms cfg into a list of portBindingReq, with all fields
// normalized:
//
//   - HostPortEnd=HostPort (rather than 0) if the host port isn't a range
//   - HostIP is set to the default host IP if not specified, and the binding is
//     NATed
//   - DisableNAT is set if the binding is routed, and HostIP is cleared
//
// When no HostIP is specified, and the default HostIP is 0.0.0.0, a duplicate
// IPv6 port binding is created with the same port and protocol, but with
// HostIP set to [::].
//
// Finally, port bindings are sorted into the ordering defined by
// [PortBindingReqs.Compare] in order to form groups of port bindings that
// should be processed in one go.
func (n *bridgeNetwork) sortAndNormPBs(
	ctx context.Context,
	ep *bridgeEndpoint,
	cfg []portmapperapi.PortBindingReq,
	defHostIP net.IP,
	pbmReq portBindingMode,
) []portmapperapi.PortBindingReq {
	var containerIPv4, containerIPv6 net.IP
	if ep.addr != nil {
		containerIPv4 = ep.addr.IP
	}
	if ep.addrv6 != nil {
		containerIPv6 = ep.addrv6.IP
	}
	hairpin := n.hairpin()
	disableNAT4, disableNAT6 := n.getNATDisabled()
	// Only generate bindings for an address family if it's requested in pbmReq
	// and the endpoint isn't already bound for that family.
	add4 := !ep.portBindingState.ipv4 && pbmReq.ipv4
	add6 := !ep.portBindingState.ipv6 && pbmReq.ipv6
	reqs := make([]portmapperapi.PortBindingReq, 0, len(cfg))
	for _, c := range cfg {
		if c.HostPortEnd == 0 {
			c.HostPortEnd = c.HostPort
		}
		if add4 {
			if bindingIPv4, ok := configurePortBindingIPv4(ctx, disableNAT4, c, containerIPv4, defHostIP); ok {
				reqs = append(reqs, bindingIPv4)
			}
		}
		// If the container has no IPv6 address, allow proxying host IPv6 traffic to it
		// by setting up the binding with the IPv4 interface if the userland proxy is enabled
		// This change was added to keep backward compatibility
		containerIP := containerIPv6
		if containerIPv6 == nil && pbmReq.ipv4 && add6 {
			if hairpin {
				// There's no way to map from host-IPv6 to container-IPv4 with the userland proxy
				// disabled.
				// If that is required, don't treat it as an error because, as networks are
				// connected/disconnected, the container's gateway endpoint might change to a
				// network where this config makes more sense.
				if len(c.HostIP) > 0 && c.HostIP.To4() == nil {
					log.G(ctx).WithFields(log.Fields{"mapping": c}).Info(
						"Cannot map from IPv6 to an IPv4-only container because the userland proxy is disabled")
				}
				if len(c.HostIP) == 0 && defHostIP.To4() == nil {
					log.G(ctx).WithFields(log.Fields{
						"mapping": c,
						"default": defHostIP,
					}).Info("Cannot map from default host binding address to an IPv4-only container because the userland proxy is disabled")
				}
			} else {
				containerIP = containerIPv4
			}
		}
		if add6 {
			if bindingIPv6, ok := configurePortBindingIPv6(ctx, disableNAT6, c, containerIP, defHostIP); ok {
				reqs = append(reqs, bindingIPv6)
			}
		}
	}
	slices.SortFunc(reqs, func(a, b portmapperapi.PortBindingReq) int {
		return a.Compare(b)
	})
	return reqs
}
// needSamePort reports whether a and b differ only in the host IP address, in
// which case they should be allocated the same host port (so that, if v4/v6
// addresses are returned in a DNS response or similar, clients can bind
// without needing to adjust the port number depending on which address is
// used).
func needSamePort(a, b portmapperapi.PortBindingReq) bool {
	if a.Port != b.Port || a.Proto != b.Proto {
		return false
	}
	return a.HostPort == b.HostPort && a.HostPortEnd == b.HostPortEnd
}
// mergeChildHostIPs takes a slice of PortBinding and returns a slice of
// types.PortBinding, where the HostIP in each of the results has the
// value of ChildHostIP from the input (if present).
func mergeChildHostIPs(pbs []portmapperapi.PortBinding) []types.PortBinding {
	merged := make([]types.PortBinding, 0, len(pbs))
	for i := range pbs {
		pb := pbs[i].PortBinding
		if ip := pbs[i].ChildHostIP; ip != nil {
			pb.HostIP = ip
		}
		merged = append(merged, pb)
	}
	return merged
}
// configurePortBindingIPv4 returns a new port binding with the HostIP field
// populated and true, if a binding is required. Else, false and an empty
// binding.
//
// No binding is required when the container has no IPv4 address, or when the
// mapping is explicitly IPv6 (an IPv6 host address is given, or no host
// address is given and the default host address is IPv6).
func configurePortBindingIPv4(
	ctx context.Context,
	disableNAT bool,
	bnd portmapperapi.PortBindingReq,
	containerIPv4,
	defHostIP net.IP,
) (portmapperapi.PortBindingReq, bool) {
	if len(containerIPv4) == 0 {
		return portmapperapi.PortBindingReq{}, false
	}
	if len(bnd.HostIP) > 0 && bnd.HostIP.To4() == nil {
		// The mapping is explicitly IPv6.
		return portmapperapi.PortBindingReq{}, false
	}
	// If there's no host address, use the default.
	if len(bnd.HostIP) == 0 {
		if defHostIP.To4() == nil {
			// The default binding address is IPv6.
			return portmapperapi.PortBindingReq{}, false
		}
		// The default binding IP is an IPv4 address, use it - unless NAT is disabled,
		// in which case it's not possible to bind to a specific host address (the port
		// mapping only opens the container's port for direct routing).
		if disableNAT {
			bnd.HostIP = net.IPv4zero
		} else {
			bnd.HostIP = defHostIP
		}
	}
	if disableNAT && len(bnd.HostIP) != 0 && !bnd.HostIP.Equal(net.IPv4zero) {
		// Ignore the default binding when nat is disabled - it may have been set
		// up for IPv6 if nat is enabled there.
		// Don't treat this as an error because, as networks are connected/disconnected,
		// the container's gateway endpoint might change to a network where this config
		// makes more sense.
		log.G(ctx).WithFields(log.Fields{"mapping": bnd}).Info(
			"Using address 0.0.0.0 because NAT is disabled")
		bnd.HostIP = net.IPv4zero
	}
	// Unmap the addresses if they're IPv4-mapped IPv6.
	bnd.HostIP = bnd.HostIP.To4()
	bnd.IP = containerIPv4.To4()
	// Select the port mapper implementation based on whether NAT is in use.
	bnd.Mapper = "nat"
	if disableNAT {
		bnd.Mapper = "routed"
	}
	return bnd, true
}
// configurePortBindingIPv6 returns a new port binding with the HostIP field
// populated and true, if a binding is required. Else, false and an empty
// binding.
//
// containerIP may be the container's IPv4 address when host IPv6 traffic is
// proxied to an IPv4-only container (see caller). No binding is required when
// containerIP is nil, or when the mapping is explicitly IPv4.
func configurePortBindingIPv6(
	ctx context.Context,
	disableNAT bool,
	bnd portmapperapi.PortBindingReq,
	containerIP, defHostIP net.IP,
) (portmapperapi.PortBindingReq, bool) {
	if containerIP == nil {
		return portmapperapi.PortBindingReq{}, false
	}
	if len(bnd.HostIP) > 0 && bnd.HostIP.To4() != nil {
		// The mapping is explicitly IPv4.
		return portmapperapi.PortBindingReq{}, false
	}
	// If there's no host address, use the default.
	if len(bnd.HostIP) == 0 {
		if defHostIP.Equal(net.IPv4zero) {
			if !netutils.IsV6Listenable() {
				// No implicit binding if the host has no IPv6 support.
				return portmapperapi.PortBindingReq{}, false
			}
			// Implicit binding to "::", no explicit HostIP and the default is 0.0.0.0
			bnd.HostIP = net.IPv6zero
		} else if defHostIP.To4() == nil {
			// The default binding IP is an IPv6 address, use it - unless NAT is disabled, in
			// which case it's not possible to bind to a specific host address (the port
			// mapping only opens the container's port for direct routing).
			if disableNAT {
				bnd.HostIP = net.IPv6zero
			} else {
				bnd.HostIP = defHostIP
			}
		} else {
			// The default binding IP is an IPv4 address, nothing to do here.
			return portmapperapi.PortBindingReq{}, false
		}
	}
	if disableNAT && len(bnd.HostIP) != 0 && !bnd.HostIP.Equal(net.IPv6zero) {
		// Ignore the default binding when nat is disabled - it may have been set
		// up for IPv4 if nat is enabled there.
		// Don't treat this as an error because, as networks are connected/disconnected,
		// the container's gateway endpoint might change to a network where this config
		// makes more sense.
		log.G(ctx).WithFields(log.Fields{"mapping": bnd}).Info(
			"Using address [::] because NAT is disabled")
		bnd.HostIP = net.IPv6zero
	}
	bnd.IP = containerIP
	// Select the port mapper implementation based on whether NAT is in use.
	bnd.Mapper = "nat"
	if disableNAT {
		bnd.Mapper = "routed"
	}
	return bnd, true
}
// releasePorts attempts to release all port bindings, does not stop on failure
func (n *bridgeNetwork) releasePorts(ep *bridgeEndpoint) error {
	// Detach the endpoint's bindings under the lock, then unmap them after
	// the lock has been released.
	n.Lock()
	bindings := ep.portMapping
	ep.portMapping = nil
	ep.portBindingState = portBindingMode{}
	n.Unlock()
	return n.unmapPBs(context.TODO(), bindings)
}
// unmapPBs releases each binding via the port mapper that created it,
// collecting errors rather than stopping at the first failure.
func (n *bridgeNetwork) unmapPBs(ctx context.Context, bindings []portmapperapi.PortBinding) error {
	pms := n.portMappers()
	var errs []error
	for _, binding := range bindings {
		pm, err := pms.Get(binding.Mapper)
		if err == nil {
			err = pm.UnmapPorts(ctx, []portmapperapi.PortBinding{binding}, n.firewallerNetwork)
		}
		if err != nil {
			errs = append(errs, fmt.Errorf("unmapping port binding %s: %w", binding.PortBinding, err))
		}
	}
	return errors.Join(errs...)
}
// reapplyPerPortIptables re-adds the per-port firewall rules for all port
// bindings of every endpoint on this network. Failure is logged, not returned.
func (n *bridgeNetwork) reapplyPerPortIptables() {
	// Snapshot the bindings under the lock.
	n.Lock()
	var allPBs []portmapperapi.PortBinding
	for _, ep := range n.endpoints {
		allPBs = append(allPBs, ep.portMapping...)
	}
	n.Unlock()
	err := n.firewallerNetwork.AddPorts(context.Background(), mergeChildHostIPs(allPBs))
	if err != nil {
		log.G(context.TODO()).Warnf("Failed to reconfigure NAT: %s", err)
	}
}
//go:build linux
package bridge
import (
"context"
"github.com/docker/docker/internal/otelutil"
"go.opentelemetry.io/otel"
)
// setupStep is a single named bridge-configuration step; the name is used for
// tracing.
type setupStep struct {
	name string
	fn   stepFn
}

// stepFn performs one piece of bridge setup.
type stepFn func(*networkConfiguration, *bridgeInterface) error

// bridgeSetup queues setup steps for a bridge interface so they can be
// applied in order.
type bridgeSetup struct {
	config *networkConfiguration
	bridge *bridgeInterface
	steps  []setupStep
}
// newBridgeSetup returns a bridgeSetup for the given configuration and bridge
// interface, with an empty step queue.
func newBridgeSetup(c *networkConfiguration, i *bridgeInterface) *bridgeSetup {
	return &bridgeSetup{config: c, bridge: i}
}
// apply runs the queued setup steps in order, stopping at the first error.
// Each step is wrapped in an OTel span that records its name and outcome.
func (b *bridgeSetup) apply(ctx context.Context) error {
	runStep := func(step setupStep) error {
		// The span's context isn't used yet; if / when stepFn starts taking a
		// context, pass it through here.
		_, span := otel.Tracer("").Start(ctx, spanPrefix+"."+step.name)
		defer span.End()
		err := step.fn(b.config, b.bridge)
		otelutil.RecordStatus(span, err)
		return err
	}
	for _, step := range b.steps {
		if err := runStep(step); err != nil {
			return err
		}
	}
	return nil
}
// queueStep appends a named setup step to be run by apply.
func (b *bridgeSetup) queueStep(name string, fn stepFn) {
	b.steps = append(b.steps, setupStep{name: name, fn: fn})
}
//go:build linux
package bridge
import (
"context"
"errors"
"fmt"
"os"
"syscall"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/internal/modprobe"
)
// setupIPv4BridgeNetFiltering checks whether IPv4 forwarding is enabled and, if
// it is, sets kernel param "bridge-nf-call-iptables=1" so that packets traversing
// the bridge are filtered. A failure to read the forwarding setting is logged
// and otherwise ignored.
func setupIPv4BridgeNetFiltering(*networkConfiguration, *bridgeInterface) error {
	enabled, err := getKernelBoolParam("/proc/sys/net/ipv4/ip_forward")
	if err != nil {
		log.G(context.TODO()).Warnf("failed to check IPv4 forwarding: %v", err)
		return nil
	}
	if !enabled {
		return nil
	}
	return enableBridgeNetFiltering("/proc/sys/net/bridge/bridge-nf-call-iptables")
}
// setupIPv6BridgeNetFiltering checks whether IPv6 forwarding is enabled for the
// bridge and, if it is, sets kernel param "bridge-nf-call-ip6tables=1" so that
// packets traversing the bridge are filtered. A failure to read the forwarding
// setting is logged and otherwise ignored.
func setupIPv6BridgeNetFiltering(config *networkConfiguration, _ *bridgeInterface) error {
	if !config.EnableIPv6 {
		return nil
	}
	if config.BridgeName == "" {
		return errors.New("unable to check IPv6 forwarding, no bridge name specified")
	}
	enabled, err := getKernelBoolParam("/proc/sys/net/ipv6/conf/" + config.BridgeName + "/forwarding")
	if err != nil {
		log.G(context.TODO()).Warnf("failed to check IPv6 forwarding: %v", err)
		return nil
	}
	if !enabled {
		return nil
	}
	return enableBridgeNetFiltering("/proc/sys/net/bridge/bridge-nf-call-ip6tables")
}
// loadBridgeNetFilterModule loads br_netfilter, which implicitly loads the
// bridge module upon modprobe. The callback given to modprobe.LoadModules
// stats fullPath; presumably a nil result lets it skip loading — confirm in
// that package.
func loadBridgeNetFilterModule(fullPath string) error {
	sysctlExists := func() error {
		_, err := os.Stat(fullPath)
		return err
	}
	return modprobe.LoadModules(context.TODO(), sysctlExists, "br_netfilter")
}
// Enable bridge net filtering if not already enabled. See GitHub issue #11404
//
// On failure, if the DOCKER_IGNORE_BR_NETFILTER_ERROR environment variable is
// set to "1", the error is logged and suppressed; otherwise it is annotated
// with a hint about that variable.
func enableBridgeNetFiltering(nfParam string) (retErr error) {
	defer func() {
		if retErr != nil {
			if os.Getenv("DOCKER_IGNORE_BR_NETFILTER_ERROR") == "1" {
				log.G(context.TODO()).WithError(retErr).Warnf("Continuing without enabling br_netfilter")
				retErr = nil
				return
			}
			retErr = fmt.Errorf("%w: set environment variable DOCKER_IGNORE_BR_NETFILTER_ERROR=1 to ignore", retErr)
		}
	}()
	if err := loadBridgeNetFilterModule(nfParam); err != nil {
		return fmt.Errorf("cannot restrict inter-container communication or run without the userland proxy: %w", err)
	}
	enabled, err := getKernelBoolParam(nfParam)
	if err != nil {
		var pathErr *os.PathError
		if errors.As(err, &pathErr) && errors.Is(pathErr, syscall.ENOENT) {
			// The sysctl file doesn't exist. Inside a container, the host's
			// kernel parameters may simply not be visible, so don't fail.
			if isRunningInContainer() {
				log.G(context.TODO()).WithError(err).Warnf("running inside docker container, ignoring missing kernel params")
				return nil
			}
			err = errors.New("ensure that the br_netfilter kernel module is loaded")
		}
		return fmt.Errorf("cannot restrict inter-container communication or run without the userland proxy: %v", err)
	}
	if !enabled {
		return os.WriteFile(nfParam, []byte{'1', '\n'}, 0o644)
	}
	return nil
}
// getKernelBoolParam reports whether the kernel parameter file at path
// currently holds the value '1'. An unreadable file yields an error; an empty
// file reads as false.
func getKernelBoolParam(path string) (bool, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return false, err
	}
	if len(data) == 0 {
		return false, nil
	}
	return data[0] == '1', nil
}
// isRunningInContainer reports whether the daemon appears to be running
// inside a Docker container, detected via the /.dockerenv marker file.
func isRunningInContainer() bool {
	if _, err := os.Stat("/.dockerenv"); err != nil {
		return !os.IsNotExist(err)
	}
	return true
}
package bridge
import (
"context"
"fmt"
"os"
"path/filepath"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/netutils"
"github.com/docker/docker/errdefs"
"github.com/vishvananda/netlink"
)
// setupDevice creates a new bridge interface for the network. For the default
// network, a bridge with a non-default name must have been created manually,
// so creating one here is forbidden.
func setupDevice(config *networkConfiguration, i *bridgeInterface) error {
	if config.DefaultBridge && config.BridgeName != DefaultBridgeName {
		// TODO(thaJeztah): should this be an [errdefs.ErrInvalidParameter], not an [errdefs.ErrForbidden]?
		return errdefs.Forbidden(fmt.Errorf("bridge device with non default name %s must be created manually", config.BridgeName))
	}
	// Set the bridge's MAC address. Requires kernel version 3.3 or up.
	hwAddr := netutils.GenerateRandomMAC()
	log.G(context.TODO()).Debugf("Setting bridge mac address to %s", hwAddr)
	// Set the bridgeInterface netlink.Bridge.
	i.Link = &netlink.Bridge{
		LinkAttrs: netlink.LinkAttrs{
			Name:         config.BridgeName,
			HardwareAddr: hwAddr,
		},
	}
	if err := i.nlh.LinkAdd(i.Link); err != nil {
		log.G(context.TODO()).WithError(err).Errorf("Failed to create bridge %s via netlink", config.BridgeName)
		return err
	}
	return nil
}
// setupMTU applies the configured MTU to the bridge link via netlink.
func setupMTU(config *networkConfiguration, i *bridgeInterface) error {
	err := i.nlh.LinkSetMTU(i.Link, config.Mtu)
	if err != nil {
		log.G(context.TODO()).WithError(err).Errorf("Failed to set bridge MTU %s via netlink", config.BridgeName)
	}
	return err
}
// setupDefaultSysctl disables IPv6 router advertisements originating on the
// bridge by writing '0' to net.ipv6.conf.<bridge>.accept_ra. A missing sysctl
// file or a failed write is logged but never treated as fatal.
func setupDefaultSysctl(config *networkConfiguration, i *bridgeInterface) error {
	sysPath := filepath.Join("/proc/sys/net/ipv6/conf/", config.BridgeName, "accept_ra")
	logger := log.G(context.TODO())
	if _, err := os.Stat(sysPath); err != nil {
		logger.WithField("bridge", config.BridgeName).
			WithField("syspath", sysPath).
			Info("failed to read ipv6 net.ipv6.conf.<bridge>.accept_ra")
		return nil
	}
	if err := os.WriteFile(sysPath, []byte{'0', '\n'}, 0o644); err != nil {
		logger.WithError(err).Warn("unable to disable IPv6 router advertisement")
	}
	return nil
}
// setupDeviceUp brings the bridge link up, then re-reads the link from
// netlink so the cached flags reflect the new state. The refresh is
// best-effort; a failure there is only logged.
func setupDeviceUp(config *networkConfiguration, i *bridgeInterface) error {
	if err := i.nlh.LinkSetUp(i.Link); err != nil {
		return fmt.Errorf("Failed to set link up for %s: %v", config.BridgeName, err)
	}
	lnk, err := i.nlh.LinkByName(config.BridgeName)
	if err != nil {
		log.G(context.TODO()).Warnf("Failed to retrieve link for interface (%s): %v", config.BridgeName, err)
		return nil
	}
	i.Link = lnk
	return nil
}
//go:build linux
package bridge
import (
"context"
"fmt"
"os"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge/internal/firewaller"
)
// Paths of the sysctl files that control IP forwarding.
const (
	ipv4ForwardConf        = "/proc/sys/net/ipv4/ip_forward"
	ipv6ForwardConfDefault = "/proc/sys/net/ipv6/conf/default/forwarding"
	ipv6ForwardConfAll     = "/proc/sys/net/ipv6/conf/all/forwarding"
)
// setupIPv4Forwarding enables the kernel's IPv4 forwarding if it isn't
// already on. If this call had to turn it on and wantFilterForwardDrop is
// set, the firewaller's filter-forward policy is set to drop; should that
// fail, the forwarding change is rolled back.
func setupIPv4Forwarding(fw firewaller.Firewaller, wantFilterForwardDrop bool) (retErr error) {
	changed, err := configureIPForwarding(ipv4ForwardConf, '1')
	if err != nil {
		return err
	}
	if !changed {
		// Forwarding was already enabled; leave the filter policy alone.
		return nil
	}
	// Roll back the sysctl change on error.
	defer func() {
		if retErr != nil {
			if _, err := configureIPForwarding(ipv4ForwardConf, '0'); err != nil {
				log.G(context.TODO()).WithError(err).Error("Cannot disable IPv4 forwarding")
			}
		}
	}()
	// When enabling ip_forward set the default policy on forward chain to drop.
	if wantFilterForwardDrop {
		return fw.FilterForwardDrop(context.TODO(), firewaller.IPv4)
	}
	return nil
}
// setupIPv6Forwarding enables the IPv6 "default" and "all" forwarding
// sysctls. Each sysctl this call actually changed is rolled back if a later
// step fails. If any change was made and wantFilterForwardDrop is set, the
// IPv6 filter-FORWARD policy is switched to drop.
func setupIPv6Forwarding(fw firewaller.Firewaller, wantFilterForwardDrop bool) (retErr error) {
	// Set IPv6 default.forwarding, if needed.
	// FIXME(robmry) - is it necessary to set this, setting "all" (below) does the job?
	defaultChanged, err := configureIPForwarding(ipv6ForwardConfDefault, '1')
	if err != nil {
		return err
	}
	if defaultChanged {
		// Roll back the "default" sysctl on failure.
		defer func() {
			if retErr != nil {
				if _, err := configureIPForwarding(ipv6ForwardConfDefault, '0'); err != nil {
					log.G(context.TODO()).WithError(err).Error("Cannot disable IPv6 default.forwarding")
				}
			}
		}()
	}
	// Set IPv6 all.forwarding, if needed.
	allChanged, err := configureIPForwarding(ipv6ForwardConfAll, '1')
	if err != nil {
		return err
	}
	if allChanged {
		// Roll back the "all" sysctl on failure.
		defer func() {
			if retErr != nil {
				if _, err := configureIPForwarding(ipv6ForwardConfAll, '0'); err != nil {
					log.G(context.TODO()).WithError(err).Error("Cannot disable IPv6 all.forwarding")
				}
			}
		}()
	}
	// Only touch the FORWARD policy if this call enabled forwarding.
	if wantFilterForwardDrop && (defaultChanged || allChanged) {
		return fw.FilterForwardDrop(context.TODO(), firewaller.IPv6)
	}
	return nil
}
func configureIPForwarding(file string, val byte) (changed bool, _ error) {
data, err := os.ReadFile(file)
if err != nil || len(data) == 0 {
return false, fmt.Errorf("cannot read IP forwarding setup from '%s': %w", file, err)
}
if len(data) == 0 {
return false, fmt.Errorf("cannot read IP forwarding setup from '%s': 0 bytes", file)
}
if data[0] == val {
return false, nil
}
if err := os.WriteFile(file, []byte{val, '\n'}, 0o644); err != nil {
return false, fmt.Errorf("failed to set IP forwarding '%s' = '%c': %w", file, val, err)
}
return true, nil
}
package bridge
import (
"context"
"errors"
"fmt"
"net"
"os"
"path/filepath"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/vishvananda/netlink"
)
// selectIPv4Address picks an address from addresses: the first one contained
// in selector when selector is non-nil and matches, otherwise the first
// address overall. It errors only when addresses is empty.
func selectIPv4Address(addresses []netlink.Addr, selector *net.IPNet) (netlink.Addr, error) {
	if len(addresses) == 0 {
		return netlink.Addr{}, errors.New("unable to select an address as the address pool is empty")
	}
	if selector == nil {
		return addresses[0], nil
	}
	for _, candidate := range addresses {
		if selector.Contains(candidate.IP) {
			return candidate, nil
		}
	}
	// No address matched the selector; fall back to the first one.
	return addresses[0], nil
}
// setupBridgeIPv4 assigns the configured IPv4 address to the bridge
// interface (unless IPv4 is inhibited or the gateway mode is isolated),
// replacing any existing non-matching address, and records the gateway IP
// for non-internal networks.
func setupBridgeIPv4(config *networkConfiguration, i *bridgeInterface) error {
	// TODO(aker): the bridge driver panics if its bridgeIPv4 field isn't set. Once bridge subnet and bridge IP address
	// are decoupled, we should assign it only when it's really needed.
	i.bridgeIPv4 = config.AddressIPv4
	if !config.InhibitIPv4 && !config.GwModeIPv4.isolated() {
		addrv4List, err := i.addresses(netlink.FAMILY_V4)
		if err != nil {
			return fmt.Errorf("failed to retrieve bridge interface addresses: %v", err)
		}
		// Error deliberately ignored: with no addresses, addrv4 is the zero
		// value and the nil checks below handle it.
		addrv4, _ := selectIPv4Address(addrv4List, config.AddressIPv4)
		if !types.CompareIPNet(addrv4.IPNet, config.AddressIPv4) {
			// Remove the current (mismatching) address before adding the
			// requested one.
			if addrv4.IPNet != nil {
				if err := i.nlh.AddrDel(i.Link, &addrv4); err != nil {
					return fmt.Errorf("failed to remove current ip address from bridge: %v", err)
				}
			}
			log.G(context.TODO()).Debugf("Assigning address to bridge interface %s: %s", config.BridgeName, config.AddressIPv4)
			if err := i.nlh.AddrAdd(i.Link, &netlink.Addr{IPNet: config.AddressIPv4}); err != nil {
				return fmt.Errorf("failed to add IPv4 address %s to bridge: %v", config.AddressIPv4, err)
			}
		}
	}
	if !config.Internal {
		// Store the default gateway
		i.gatewayIPv4 = config.AddressIPv4.IP
	}
	return nil
}
// setupGatewayIPv4 validates the requested IPv4 default gateway and records
// it on the bridge interface. The gateway must lie inside the bridge subnet,
// and internal networks cannot have a gateway at all.
func setupGatewayIPv4(config *networkConfiguration, i *bridgeInterface) error {
	if !i.bridgeIPv4.Contains(config.DefaultGatewayIPv4) {
		return errInvalidGateway
	}
	if config.Internal {
		return types.InvalidParameterErrorf("no gateway can be set on an internal bridge network")
	}
	// Store requested default gateway
	i.gatewayIPv4 = config.DefaultGatewayIPv4
	return nil
}
// setupLoopbackAddressesRouting enables the route_localnet sysctl for the
// bridge so loopback addresses can be routed (required for hairpin mode).
func setupLoopbackAddressesRouting(config *networkConfiguration, i *bridgeInterface) error {
	sysPath := filepath.Join("/proc/sys/net/ipv4/conf", config.BridgeName, "route_localnet")
	ipv4LoRoutingData, err := os.ReadFile(sysPath)
	if err != nil {
		return fmt.Errorf("Cannot read IPv4 local routing setup: %v", err)
	}
	// Enable loopback addresses routing only if it isn't already enabled
	if ipv4LoRoutingData[0] == '1' {
		return nil
	}
	if err := os.WriteFile(sysPath, []byte{'1', '\n'}, 0o644); err != nil {
		return fmt.Errorf("Unable to enable local routing for hairpin mode: %v", err)
	}
	return nil
}
package bridge
import (
"fmt"
"net/netip"
"os"
)
// linkLocalPrefix is the standard IPv6 link-local prefix, fe80::/64.
var linkLocalPrefix = netip.MustParsePrefix("fe80::/64")
// setupBridgeIPv6 configures IPv6 on the bridge: for "isolated" gateway mode
// it disables IPv6 entirely (so no kernel-assigned addresses appear);
// otherwise it enables IPv6 and programs the required addresses.
func setupBridgeIPv6(config *networkConfiguration, i *bridgeInterface) error {
	sysctlPath := "/proc/sys/net/ipv6/conf/" + config.BridgeName + "/disable_ipv6"
	sysctlData, err := os.ReadFile(sysctlPath)
	if err != nil {
		return fmt.Errorf("Cannot read IPv6 setup for bridge %v: %v", config.BridgeName, err)
	}
	// Disable IPv6 on the bridge if the network is "isolated", so that it
	// doesn't get a kernel-assigned LL address (or any other IPv6 address).
	if config.GwModeIPv6.isolated() {
		if sysctlData[0] != '1' {
			if err := os.WriteFile(sysctlPath, []byte{'1', '\n'}, 0o644); err != nil {
				return fmt.Errorf("unable to disable IPv6 addresses on bridge for gateway mode 'isolated': %v", err)
			}
		}
		i.bridgeIPv6 = config.AddressIPv6
		return nil
	}
	// Enable IPv6 on the bridge only if it isn't already enabled
	if sysctlData[0] != '0' {
		if err := os.WriteFile(sysctlPath, []byte{'0', '\n'}, 0o644); err != nil {
			return fmt.Errorf("Unable to enable IPv6 addresses on bridge: %v", err)
		}
	}
	// Remove unwanted addresses from the bridge, add required addresses, and assign
	// values to "i.bridgeIPv6", "i.gatewayIPv6".
	return i.programIPv6Addresses(config)
}
// setupGatewayIPv6 validates the requested IPv6 default gateway (it must lie
// inside the network's IPv6 subnet) and records it on the bridge interface.
func setupGatewayIPv6(config *networkConfiguration, i *bridgeInterface) error {
	if !config.AddressIPv6.Contains(config.DefaultGatewayIPv6) {
		return errInvalidGateway
	}
	// Store requested default gateway
	i.gatewayIPv6 = config.DefaultGatewayIPv6
	return nil
}
package bridge
import (
"errors"
"fmt"
"strings"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/vishvananda/netlink"
)
// setupVerifyAndReconcileIPv4 checks what IPv4 addresses the given i interface has
// and ensures that they match the passed network config.
func setupVerifyAndReconcileIPv4(config *networkConfiguration, i *bridgeInterface) error {
	// Isolated gateway mode means no IPv4 address is expected; skip checks.
	if config.GwModeIPv4.isolated() {
		return nil
	}
	// Fetch a slice of IPv4 addresses from the bridge.
	bridgeAddrs, err := i.addresses(netlink.FAMILY_V4)
	if err != nil {
		return fmt.Errorf("Failed to verify ip addresses: %v", err)
	}
	selected, _ := selectIPv4Address(bridgeAddrs, config.AddressIPv4)
	// Verify that the bridge has an IPv4 address.
	if selected.IPNet == nil {
		return errors.New("bridge has no IPv4 address configured")
	}
	// Verify that the bridge IPv4 address matches the requested configuration.
	if config.AddressIPv4 != nil && !selected.IP.Equal(config.AddressIPv4.IP) {
		return fmt.Errorf("bridge IPv4 (%s) does not match requested configuration %s", selected.IP, config.AddressIPv4.IP)
	}
	return nil
}
// bridgeInterfaceExists reports whether a link named name exists and is a
// bridge. A missing link is not an error; an existing non-bridge link is.
func bridgeInterfaceExists(name string) (bool, error) {
	link, err := ns.NlHandle().LinkByName(name)
	switch {
	case err == nil:
		if link.Type() != "bridge" {
			return false, fmt.Errorf("existing interface %s is not a bridge", name)
		}
		return true, nil
	case strings.Contains(err.Error(), "Link not found"):
		// The link simply doesn't exist.
		return false, nil
	default:
		return false, fmt.Errorf("failed to check bridge interface existence: %v", err)
	}
}
package bridge
import (
"context"
"os"
"github.com/containerd/log"
"github.com/docker/docker/internal/nlwrap"
"github.com/pkg/errors"
"github.com/vishvananda/netlink"
)
// wslinfoPath is the path to the executable installed in Linux under WSL2
// that reports on WSL config. https://github.com/microsoft/WSL/releases/tag/2.0.4
// Can be modified by tests.
var wslinfoPath = "/usr/bin/wslinfo"
// isRunningUnderWSL2MirroredMode returns true if the host Linux appears to be
// running under Windows WSL2 with mirrored mode networking. If a loopback0
// device exists, and there's an executable at /usr/bin/wslinfo, infer that
// this is WSL2 with mirrored networking. ("wslinfo --networking-mode" reports
// "mirrored", but applying the workaround for WSL2's loopback device when it's
// not needed is low risk, compared with executing wslinfo with dockerd's
// elevated permissions.)
func isRunningUnderWSL2MirroredMode(ctx context.Context) bool {
	if _, err := nlwrap.LinkByName("loopback0"); err != nil {
		// Only an unexpected lookup failure is worth logging; "not found"
		// simply means we're not under WSL2 mirrored networking.
		if !errors.As(err, &netlink.LinkNotFoundError{}) {
			log.G(ctx).WithError(err).Warn("Failed to check for WSL interface")
		}
		return false
	}
	fi, err := os.Stat(wslinfoPath)
	if err != nil {
		return false
	}
	isExecutable := fi.Mode().Perm()&0o111 != 0
	return fi.Mode().IsRegular() && isExecutable
}
package host
import (
"context"
"sync"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/scope"
"github.com/docker/docker/daemon/libnetwork/types"
)
// NetworkType is the driver type name for the built-in host network driver.
const NetworkType = "host"
// driver implements the host network driver. It records the ID of the single
// allowed host network; the embedded mutex guards that field.
type driver struct {
	network string // ID of the one host network, "" until created
	sync.Mutex
}
// Register registers the host network driver with r, with local data and
// connectivity scope.
func Register(r driverapi.Registerer) error {
	return r.RegisterDriver(NetworkType, &driver{}, driverapi.Capability{
		DataScope:         scope.Local,
		ConnectivityScope: scope.Local,
	})
}
// NetworkAllocate is not supported by the host driver.
func (d *driver) NetworkAllocate(id string, option map[string]string, ipV4Data, ipV6Data []driverapi.IPAMData) (map[string]string, error) {
	return nil, types.NotImplementedErrorf("not implemented")
}
// NetworkFree is not supported by the host driver.
func (d *driver) NetworkFree(id string) error {
	return types.NotImplementedErrorf("not implemented")
}
// EventNotify is a no-op for the host driver.
func (d *driver) EventNotify(etype driverapi.EventType, nid, tableName, key string, value []byte) {
}
// DecodeTableEntry is a no-op for the host driver.
func (d *driver) DecodeTableEntry(tablename string, key string, value []byte) (string, map[string]string) {
	return "", nil
}
// CreateNetwork records id as the host network. Only a single host network
// may exist at a time; a second create attempt is rejected.
func (d *driver) CreateNetwork(ctx context.Context, id string, option map[string]interface{}, nInfo driverapi.NetworkInfo, ipV4Data, ipV6Data []driverapi.IPAMData) error {
	d.Lock()
	defer d.Unlock()
	if d.network != "" {
		return types.ForbiddenErrorf("only one instance of %q network is allowed", NetworkType)
	}
	d.network = id
	return nil
}
// DeleteNetwork always fails: the host network cannot be removed.
func (d *driver) DeleteNetwork(nid string) error {
	return types.ForbiddenErrorf("network of type %q cannot be deleted", NetworkType)
}
// CreateEndpoint is a no-op: host endpoints need no driver-side setup.
func (d *driver) CreateEndpoint(_ context.Context, nid, eid string, ifInfo driverapi.InterfaceInfo, epOptions map[string]interface{}) error {
	return nil
}
// DeleteEndpoint is a no-op: host endpoints hold no driver-side state.
func (d *driver) DeleteEndpoint(nid, eid string) error {
	return nil
}
// EndpointOperInfo returns an empty map: host endpoints expose no
// operational data.
func (d *driver) EndpointOperInfo(nid, eid string) (map[string]interface{}, error) {
	return make(map[string]interface{}), nil
}
// Join method is invoked when a Sandbox is attached to an endpoint.
// It is a no-op for the host driver: the sandbox shares the host namespace.
func (d *driver) Join(_ context.Context, nid, eid string, sboxKey string, jinfo driverapi.JoinInfo, _, _ map[string]interface{}) error {
	return nil
}
// Leave method is invoked when a Sandbox detaches from an endpoint.
// It is a no-op for the host driver.
func (d *driver) Leave(nid, eid string) error {
	return nil
}
// Type returns the driver type name ("host").
func (d *driver) Type() string {
	return NetworkType
}
// IsBuiltIn reports that this is a built-in (non-plugin) driver.
func (d *driver) IsBuiltIn() bool {
	return true
}
//go:build linux
package ipvlan
import (
"net"
"sync"
"github.com/docker/docker/daemon/libnetwork/datastore"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/scope"
"github.com/docker/docker/daemon/libnetwork/types"
)
const (
	containerVethPrefix = "eth"
	vethPrefix          = "veth"
	// vethLen is the full length of generated interface names:
	// the "veth" prefix plus 7 generated characters.
	vethLen = len(vethPrefix) + 7

	NetworkType   = "ipvlan"      // driver type name
	parentOpt     = "parent"      // parent interface -o parent
	driverModeOpt = "ipvlan_mode" // mode -o ipvlan_mode
	driverFlagOpt = "ipvlan_flag" // flag -o ipvlan_flag

	modeL2  = "l2"  // ipvlan L2 mode (default)
	modeL3  = "l3"  // ipvlan L3 mode
	modeL3S = "l3s" // ipvlan L3S mode

	flagBridge  = "bridge"  // ipvlan flag bridge (default)
	flagPrivate = "private" // ipvlan flag private
	flagVepa    = "vepa"    // ipvlan flag vepa
)
// endpointTable maps endpoint IDs to endpoints.
type endpointTable map[string]*endpoint

// networkTable maps network IDs to networks.
type networkTable map[string]*network
// driver holds the state of the ipvlan network driver: the set of known
// networks and the persistent store they are saved in.
type driver struct {
	networks networkTable // known networks, keyed by network ID
	sync.Once
	// NOTE(review): the mutex presumably guards networks — confirm against
	// the addNetwork/getNetworks accessors.
	sync.Mutex
	store *datastore.Store // persistent cache of network/endpoint records
}
// endpoint is the driver-side state of a single ipvlan endpoint.
type endpoint struct {
	id      string           // endpoint ID
	nid     string           // ID of the owning network
	mac     net.HardwareAddr // MAC address; ipvlan does not support custom MACs
	addr    *net.IPNet       // IPv4 address
	addrv6  *net.IPNet       // IPv6 address
	srcName string           // name of the kernel ipvlan interface backing this endpoint
	// dbIndex/dbExists track persistence in the datastore — presumably
	// maintained by the store (de)serialization code; confirm there.
	dbIndex  uint64
	dbExists bool
}
// network is the driver-side state of one ipvlan network.
type network struct {
	id        string
	endpoints endpointTable // endpoints in this network, keyed by endpoint ID
	driver    *driver
	config    *configuration
	sync.Mutex
}
// Register initializes and registers the libnetwork ipvlan driver.
func Register(r driverapi.Registerer, store *datastore.Store, config map[string]interface{}) error {
	d := &driver{
		store:    store,
		networks: networkTable{},
	}
	// Restore persisted networks/endpoints before accepting registrations.
	if err := d.initStore(); err != nil {
		return err
	}
	caps := driverapi.Capability{
		DataScope:         scope.Local,
		ConnectivityScope: scope.Global,
	}
	return r.RegisterDriver(NetworkType, d, caps)
}
// NetworkAllocate is not supported by the ipvlan driver.
func (d *driver) NetworkAllocate(id string, option map[string]string, ipV4Data, ipV6Data []driverapi.IPAMData) (map[string]string, error) {
	return nil, types.NotImplementedErrorf("not implemented")
}
// NetworkFree is not supported by the ipvlan driver.
func (d *driver) NetworkFree(id string) error {
	return types.NotImplementedErrorf("not implemented")
}
// EndpointOperInfo returns an empty map: ipvlan endpoints expose no
// operational data.
func (d *driver) EndpointOperInfo(nid, eid string) (map[string]interface{}, error) {
	return make(map[string]interface{}), nil
}
// Type returns the driver type name ("ipvlan").
func (d *driver) Type() string {
	return NetworkType
}
// IsBuiltIn reports that this is a built-in (non-plugin) driver.
func (d *driver) IsBuiltIn() bool {
	return true
}
// EventNotify is a no-op for the ipvlan driver.
func (d *driver) EventNotify(etype driverapi.EventType, nid, tableName, key string, value []byte) {
}
// DecodeTableEntry is a no-op for the ipvlan driver.
func (d *driver) DecodeTableEntry(tablename string, key string, value []byte) (string, map[string]string) {
	return "", nil
}
//go:build linux
package ipvlan
import (
"context"
"errors"
"fmt"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/errdefs"
)
// CreateEndpoint assigns the mac, ip and endpoint id for the new container.
// Port mappings and exposed ports are not supported by ipvlan and only
// produce a warning. The endpoint is persisted to the store before being
// published on the network.
func (d *driver) CreateEndpoint(ctx context.Context, nid, eid string, ifInfo driverapi.InterfaceInfo, epOptions map[string]interface{}) error {
	if err := validateID(nid, eid); err != nil {
		return err
	}
	n, err := d.getNetwork(nid)
	if err != nil {
		return errdefs.System(fmt.Errorf("network id %q not found", nid))
	}
	if ifInfo.MacAddress() != nil {
		return errors.New("ipvlan interfaces do not support custom mac address assignment")
	}
	ep := &endpoint{
		id:     eid,
		nid:    nid,
		addr:   ifInfo.Address(),
		addrv6: ifInfo.AddressIPv6(),
	}
	// disallow port mapping -p
	// (single comma-ok assertion instead of asserting the same value twice)
	if opt, ok := epOptions[netlabel.PortMap]; ok {
		if bindings, ok := opt.([]types.PortBinding); ok && len(bindings) > 0 {
			log.G(ctx).Warnf("ipvlan driver does not support port mappings")
		}
	}
	// disallow port exposure --expose
	if opt, ok := epOptions[netlabel.ExposedPorts]; ok {
		if exposed, ok := opt.([]types.TransportPort); ok && len(exposed) > 0 {
			log.G(ctx).Warnf("ipvlan driver does not support port exposures")
		}
	}
	if err := d.storeUpdate(ep); err != nil {
		return fmt.Errorf("failed to save ipvlan endpoint %.7s to store: %v", ep.id, err)
	}
	n.addEndpoint(ep)
	return nil
}
// DeleteEndpoint remove the endpoint and associated netlink interface
func (d *driver) DeleteEndpoint(nid, eid string) error {
	if err := validateID(nid, eid); err != nil {
		return err
	}
	n := d.network(nid)
	if n == nil {
		return fmt.Errorf("network id %q not found", nid)
	}
	ep := n.endpoint(eid)
	if ep == nil {
		return fmt.Errorf("endpoint id %q not found", eid)
	}
	// Best-effort removal of the kernel interface; failures are logged but
	// do not abort the endpoint delete.
	nlh := ns.NlHandle()
	if link, err := nlh.LinkByName(ep.srcName); err == nil {
		if err := nlh.LinkDel(link); err != nil {
			log.G(context.TODO()).WithError(err).Warnf("Failed to delete interface (%s)'s link on endpoint (%s) delete", ep.srcName, ep.id)
		}
	}
	// Same best-effort policy for the persistent store.
	if err := d.storeDelete(ep); err != nil {
		log.G(context.TODO()).Warnf("Failed to remove ipvlan endpoint %.7s from store: %v", ep.id, err)
	}
	n.deleteEndpoint(ep.id)
	return nil
}
//go:build linux
package ipvlan
import (
"context"
"fmt"
"net"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/netutils"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/docker/docker/daemon/libnetwork/types"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
)
// staticRoute describes a route to be programmed into the container sandbox.
type staticRoute struct {
	Destination *net.IPNet // destination prefix
	RouteType   int        // route type constant (e.g. types.CONNECTED)
	NextHop     net.IP     // next-hop address
}
const (
	// Catch-all CIDRs used as the container's default route destination in
	// ipvlan L3/L3S modes.
	defaultV4RouteCidr = "0.0.0.0/0"
	defaultV6RouteCidr = "::/0"
)
// Join method is invoked when a Sandbox is attached to an endpoint.
// It creates the kernel ipvlan interface for the endpoint, configures
// gateway/default-route information in jinfo according to the ipvlan mode,
// and persists the updated endpoint.
func (d *driver) Join(ctx context.Context, nid, eid string, sboxKey string, jinfo driverapi.JoinInfo, epOpts, _ map[string]interface{}) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.drivers.ipvlan.Join", trace.WithAttributes(
		attribute.String("nid", nid),
		attribute.String("eid", eid),
		attribute.String("sboxKey", sboxKey)))
	defer span.End()
	n, err := d.getNetwork(nid)
	if err != nil {
		return err
	}
	// A single lookup is sufficient: n.endpoint returns the stored object,
	// so the previous second lookup (and its duplicate nil check) after
	// creating the link was redundant.
	ep := n.endpoint(eid)
	if ep == nil {
		return fmt.Errorf("could not find endpoint with id %s", eid)
	}
	// generate a name for the iface that will be renamed to eth0 in the sbox
	containerIfName, err := netutils.GenerateIfaceName(ns.NlHandle(), vethPrefix, vethLen)
	if err != nil {
		return fmt.Errorf("error generating an interface name: %v", err)
	}
	// create the netlink ipvlan interface
	vethName, err := createIPVlan(containerIfName, n.config.Parent, n.config.IpvlanMode, n.config.IpvlanFlag)
	if err != nil {
		return err
	}
	// bind the generated iface name to the endpoint
	ep.srcName = vethName
	if !n.config.Internal {
		switch n.config.IpvlanMode {
		case modeL3, modeL3S:
			// disable gateway services to add a default gw using dev eth0 only
			jinfo.DisableGatewayService()
			if ep.addr != nil {
				defaultRoute, err := ifaceGateway(defaultV4RouteCidr)
				if err != nil {
					return err
				}
				if err := jinfo.AddStaticRoute(defaultRoute.Destination, defaultRoute.RouteType, defaultRoute.NextHop); err != nil {
					return fmt.Errorf("failed to set an ipvlan l3/l3s mode ipv4 default gateway: %v", err)
				}
				log.G(ctx).Debugf("Ipvlan Endpoint Joined with IPv4_Addr: %s, Ipvlan_Mode: %s, Parent: %s",
					ep.addr.IP.String(), n.config.IpvlanMode, n.config.Parent)
			}
			// If the endpoint has a v6 address, set a v6 default route
			if ep.addrv6 != nil {
				default6Route, err := ifaceGateway(defaultV6RouteCidr)
				if err != nil {
					return err
				}
				if err = jinfo.AddStaticRoute(default6Route.Destination, default6Route.RouteType, default6Route.NextHop); err != nil {
					return fmt.Errorf("failed to set an ipvlan l3/l3s mode ipv6 default gateway: %v", err)
				}
				log.G(ctx).Debugf("Ipvlan Endpoint Joined with IPv6_Addr: %s, Ipvlan_Mode: %s, Parent: %s",
					ep.addrv6.IP.String(), n.config.IpvlanMode, n.config.Parent)
			}
		case modeL2:
			// parse and correlate the endpoint v4 address with the available v4 subnets
			if len(n.config.Ipv4Subnets) > 0 {
				s := n.getSubnetforIPv4(ep.addr)
				if s == nil {
					return fmt.Errorf("could not find a valid ipv4 subnet for endpoint %s", eid)
				}
				v4gw, _, err := net.ParseCIDR(s.GwIP)
				if err != nil {
					return fmt.Errorf("gateway %s is not a valid ipv4 address: %v", s.GwIP, err)
				}
				err = jinfo.SetGateway(v4gw)
				if err != nil {
					return err
				}
				log.G(ctx).Debugf("Ipvlan Endpoint Joined with IPv4_Addr: %s, Gateway: %s, Ipvlan_Mode: %s, Parent: %s",
					ep.addr.IP.String(), v4gw.String(), n.config.IpvlanMode, n.config.Parent)
			}
			// parse and correlate the endpoint v6 address with the available v6 subnets
			if len(n.config.Ipv6Subnets) > 0 {
				s := n.getSubnetforIPv6(ep.addrv6)
				if s == nil {
					return fmt.Errorf("could not find a valid ipv6 subnet for endpoint %s", eid)
				}
				v6gw, _, err := net.ParseCIDR(s.GwIP)
				if err != nil {
					return fmt.Errorf("gateway %s is not a valid ipv6 address: %v", s.GwIP, err)
				}
				err = jinfo.SetGatewayIPv6(v6gw)
				if err != nil {
					return err
				}
				log.G(ctx).Debugf("Ipvlan Endpoint Joined with IPv6_Addr: %s, Gateway: %s, Ipvlan_Mode: %s, Parent: %s",
					ep.addrv6.IP.String(), v6gw.String(), n.config.IpvlanMode, n.config.Parent)
			}
			if len(n.config.Ipv4Subnets) == 0 && len(n.config.Ipv6Subnets) == 0 {
				// With no addresses, don't need a gateway.
				jinfo.DisableGatewayService()
			}
		}
	} else {
		if len(n.config.Ipv4Subnets) > 0 {
			log.G(ctx).Debugf("Ipvlan Endpoint Joined with IPv4_Addr: %s, IpVlan_Mode: %s, Parent: %s",
				ep.addr.IP.String(), n.config.IpvlanMode, n.config.Parent)
		}
		if len(n.config.Ipv6Subnets) > 0 {
			log.G(ctx).Debugf("Ipvlan Endpoint Joined with IPv6_Addr: %s IpVlan_Mode: %s, Parent: %s",
				ep.addrv6.IP.String(), n.config.IpvlanMode, n.config.Parent)
		}
		// If n.config.Internal was set locally by the driver because there's no parent
		// interface, libnetwork doesn't know the network is internal. So, stop it from
		// adding a gateway endpoint.
		jinfo.DisableGatewayService()
	}
	iNames := jinfo.InterfaceName()
	err = iNames.SetNames(vethName, containerVethPrefix, netlabel.GetIfname(epOpts))
	if err != nil {
		return err
	}
	// Persist the endpoint now that srcName is known.
	if err = d.storeUpdate(ep); err != nil {
		return fmt.Errorf("failed to save ipvlan endpoint %.7s to store: %v", ep.id, err)
	}
	return nil
}
// Leave method is invoked when a Sandbox detaches from an endpoint.
func (d *driver) Leave(nid, eid string) error {
	nw, err := d.getNetwork(nid)
	if err != nil {
		return err
	}
	ep, err := nw.getEndpoint(eid)
	if err != nil {
		return err
	}
	if ep == nil {
		return fmt.Errorf("could not find endpoint with id %s", eid)
	}
	// Nothing to tear down here; the ipvlan link is removed on endpoint delete.
	return nil
}
// ifaceGateway returns a static route for either v4/v6 to be set to the container eth0
func ifaceGateway(dfNet string) (*staticRoute, error) {
	nextHop, destination, err := net.ParseCIDR(dfNet)
	if err != nil {
		return nil, fmt.Errorf("unable to parse default route %v", err)
	}
	return &staticRoute{
		Destination: destination,
		RouteType:   types.CONNECTED,
		NextHop:     nextHop,
	}, nil
}
// getSubnetforIPv4 returns the ipv4 subnet to which the given IP belongs,
// or nil if none of the network's IPv4 subnets matches.
func (n *network) getSubnetforIPv4(ip *net.IPNet) *ipSubnet {
	return getSubnetForIP(ip, n.config.Ipv4Subnets)
}
// getSubnetforIPv6 returns the ipv6 subnet to which the given IP belongs,
// or nil if none of the network's IPv6 subnets matches.
func (n *network) getSubnetforIPv6(ip *net.IPNet) *ipSubnet {
	return getSubnetForIP(ip, n.config.Ipv6Subnets)
}
// getSubnetForIP returns the subnet whose prefix contains ip and whose mask
// length equals ip's, or nil when no subnet matches. Malformed subnet
// entries are skipped (previously a single bad entry aborted the whole
// search and hid any later valid subnets).
func getSubnetForIP(ip *net.IPNet, subnets []*ipSubnet) *ipSubnet {
	for _, s := range subnets {
		_, snet, err := net.ParseCIDR(s.SubnetIP)
		if err != nil {
			// Skip unparsable entries instead of giving up entirely.
			continue
		}
		// first check if the mask lengths are the same
		i, _ := snet.Mask.Size()
		j, _ := ip.Mask.Size()
		if i != j {
			continue
		}
		if snet.Contains(ip.IP) {
			return s
		}
	}
	return nil
}
//go:build linux
package ipvlan
import (
"context"
"errors"
"fmt"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/docker/docker/daemon/libnetwork/options"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/pkg/parsers/kernel"
)
// CreateNetwork creates an ipvlan network for the specified driver type.
// It verifies kernel support, validates the IPAM pools and options, creates
// the parent (or dummy) link, and persists the configuration — rolling the
// network back if persisting fails.
func (d *driver) CreateNetwork(ctx context.Context, nid string, option map[string]interface{}, nInfo driverapi.NetworkInfo, ipV4Data, ipV6Data []driverapi.IPAMData) error {
	kv, err := kernel.GetKernelVersion()
	if err != nil {
		return fmt.Errorf("failed to check kernel version for ipvlan driver support: %v", err)
	}
	// ensure Kernel version is >= v4.2 for ipvlan support
	if kv.Kernel < ipvlanKernelVer || (kv.Kernel == ipvlanKernelVer && kv.Major < ipvlanMajorVer) {
		return fmt.Errorf("kernel version failed to meet the minimum ipvlan kernel requirement of %d.%d, found %d.%d.%d",
			ipvlanKernelVer, ipvlanMajorVer, kv.Kernel, kv.Major, kv.Minor)
	}
	// reject a null v4 network if ipv4 is required
	if v, ok := option[netlabel.EnableIPv4]; ok && v.(bool) {
		if len(ipV4Data) == 0 || ipV4Data[0].Pool.String() == "0.0.0.0/0" {
			return errdefs.InvalidParameter(errors.New("ipv4 pool is empty"))
		}
	}
	// reject a null v6 network if ipv6 is required
	if v, ok := option[netlabel.EnableIPv6]; ok && v.(bool) {
		if len(ipV6Data) == 0 || ipV6Data[0].Pool.String() == "::/0" {
			return errdefs.InvalidParameter(errors.New("ipv6 pool is empty"))
		}
	}
	// parse and validate the config and bind to networkConfiguration
	config, err := parseNetworkOptions(nid, option)
	if err != nil {
		return err
	}
	config.processIPAM(ipV4Data, ipV6Data)
	// if parent interface not specified, create a dummy type link to use named dummy+net_id
	if config.Parent == "" {
		config.Parent = getDummyName(config.ID)
	}
	foundExisting, err := d.createNetwork(config)
	if err != nil {
		return err
	}
	if foundExisting {
		return types.InternalMaskableErrorf("restoring existing network %s", config.ID)
	}
	// update persistent db, rollback on fail
	if err := d.storeUpdate(config); err != nil {
		d.deleteNetwork(config.ID)
		// Use the caller's ctx (was context.TODO()) so the log entry keeps
		// any request-scoped fields/trace data.
		log.G(ctx).Debugf("encountered an error rolling back a network create for %s : %v", config.ID, err)
		return err
	}
	return nil
}
// createNetwork is used by new network callbacks and persistent network cache
func (d *driver) createNetwork(config *configuration) (bool, error) {
	restoringExisting := false
	// A parent interface can back at most one ipvlan network; the same ID
	// re-appearing means we're restoring from the persistent cache.
	for _, nw := range d.getNetworks() {
		if config.Parent != nw.config.Parent {
			continue
		}
		if config.ID != nw.config.ID {
			return false, fmt.Errorf("network %s is already using parent interface %s",
				getDummyName(nw.config.ID), config.Parent)
		}
		log.G(context.TODO()).Debugf("Create Network for the same ID %s\n", config.ID)
		restoringExisting = true
		break
	}
	if !parentExists(config.Parent) {
		// Create a dummy link if a dummy name is set for parent
		if dummyName := getDummyName(config.ID); dummyName == config.Parent {
			if err := createDummyLink(config.Parent, dummyName); err != nil {
				return false, err
			}
			config.CreatedSlaveLink = true
			// notify the user in logs that they have limited communications
			log.G(context.TODO()).Debugf("Empty -o parent= flags limit communications to other containers inside of network: %s",
				config.Parent)
		} else {
			// if the subinterface parent_iface.vlan_id checks do not pass, return err.
			// a valid example is 'eth0.10' for a parent iface 'eth0' with a vlan id '10'
			if err := createVlanLink(config.Parent); err != nil {
				return false, err
			}
			// if driver created the networks slave link, record it for future deletion
			config.CreatedSlaveLink = true
		}
	}
	if !restoringExisting {
		// add the network
		d.addNetwork(&network{
			id:        config.ID,
			driver:    d,
			endpoints: endpointTable{},
			config:    config,
		})
	}
	return restoringExisting, nil
}
// GetSkipGwAlloc reports, per address family, whether gateway address
// allocation can be skipped for a network with the given options.
func (d *driver) GetSkipGwAlloc(opts options.Generic) (ipv4, ipv6 bool, _ error) {
	cfg, err := parseNetworkOptions("dummy", opts)
	if err != nil {
		return false, false, err
	}
	// L3 ipvlans connect the default route to an interface, no gateway address is set up.
	if cfg.IpvlanMode == modeL3 || cfg.IpvlanMode == modeL3S {
		return true, true, nil
	}
	// "--internal" networks don't need a gateway address.
	return cfg.Internal, cfg.Internal, nil
}
// DeleteNetwork deletes the network for the specified driver type
func (d *driver) DeleteNetwork(nid string) error {
	n := d.network(nid)
	if n == nil {
		return fmt.Errorf("network id %s not found", nid)
	}
	// if the driver created the slave interface, delete it, otherwise leave it
	if n.config.CreatedSlaveLink && parentExists(n.config.Parent) {
		// only delete links we own: either the dummy named after the net_id,
		// or a subinterface matching iface.vlan naming
		var err error
		if n.config.Parent == getDummyName(nid) {
			err = delDummyLink(n.config.Parent)
		} else {
			err = delVlanLink(n.config.Parent)
		}
		if err != nil {
			log.G(context.TODO()).Debugf("link %s was not deleted, continuing the delete network operation: %v",
				n.config.Parent, err)
		}
	}
	// Tear down every endpoint's kernel interface and store record
	// (best-effort: failures are logged, not fatal).
	nlh := ns.NlHandle()
	for _, ep := range n.endpoints {
		if link, err := nlh.LinkByName(ep.srcName); err == nil {
			if err := nlh.LinkDel(link); err != nil {
				log.G(context.TODO()).WithError(err).Warnf("Failed to delete interface (%s)'s link on endpoint (%s) delete", ep.srcName, ep.id)
			}
		}
		if err := d.storeDelete(ep); err != nil {
			log.G(context.TODO()).Warnf("Failed to remove ipvlan endpoint %.7s from store: %v", ep.id, err)
		}
	}
	// delete the *network
	d.deleteNetwork(nid)
	// delete the network record from persistent cache
	if err := d.storeDelete(n.config); err != nil {
		return fmt.Errorf("error deleting id %s from datastore: %v", nid, err)
	}
	return nil
}
// parseNetworkOptions parses docker network options
func parseNetworkOptions(id string, option options.Generic) (*configuration, error) {
	config := &configuration{}
	// parse generic labels first
	if genData, ok := option[netlabel.GenericData]; ok && genData != nil {
		var err error
		if config, err = parseNetworkGenericOptions(genData); err != nil {
			return nil, err
		}
	}
	if val, ok := option[netlabel.Internal]; ok {
		if internal, ok := val.(bool); ok && internal {
			config.Internal = true
		}
	}
	// verify the ipvlan mode from -o ipvlan_mode option
	switch config.IpvlanMode {
	case "":
		// default to ipvlan L2 mode if -o ipvlan_mode is empty
		config.IpvlanMode = modeL2
	case modeL2, modeL3, modeL3S:
		// valid option
	default:
		return nil, fmt.Errorf("requested ipvlan mode '%s' is not valid, 'l2' mode is the ipvlan driver default", config.IpvlanMode)
	}
	// verify the ipvlan flag from -o ipvlan_flag option
	switch config.IpvlanFlag {
	case "":
		// default to bridge if -o ipvlan_flag is empty
		config.IpvlanFlag = flagBridge
	case flagBridge, flagPrivate, flagVepa:
		// valid option
	default:
		return nil, fmt.Errorf("requested ipvlan flag '%s' is not valid, 'bridge' is the ipvlan driver default", config.IpvlanFlag)
	}
	// loopback is not a valid parent link
	if config.Parent == "lo" {
		return nil, errors.New("loopback interface is not a valid ipvlan parent link")
	}
	// With no parent interface, the network is "internal".
	if config.Parent == "" {
		config.Internal = true
	}
	config.ID = id
	return config, nil
}
// parseNetworkGenericOptions parse generic driver docker network options.
// It accepts either an already-built *configuration or a map of string
// labels; anything else is an invalid-parameter error.
func parseNetworkGenericOptions(data interface{}) (*configuration, error) {
	switch opt := data.(type) {
	case *configuration:
		return opt, nil
	case map[string]string:
		return newConfigFromLabels(opt), nil
	default:
		return nil, types.InvalidParameterErrorf("unrecognized network configuration format: %v", opt)
	}
}
// newConfigFromLabels creates a new configuration from the given labels.
// Unrecognized labels are ignored.
func newConfigFromLabels(labels map[string]string) *configuration {
	cfg := &configuration{}
	for key, value := range labels {
		switch key {
		case parentOpt: // driver option '-o parent'
			cfg.Parent = value
		case driverModeOpt: // driver option '-o ipvlan_mode'
			cfg.IpvlanMode = value
		case driverFlagOpt: // driver option '-o ipvlan_flag'
			cfg.IpvlanFlag = value
		}
	}
	return cfg
}
// processIPAM parses v4 and v6 IP information and binds it to the network configuration
func (config *configuration) processIPAM(ipamV4Data, ipamV6Data []driverapi.IPAMData) {
	for _, v4 := range ipamV4Data {
		config.Ipv4Subnets = append(config.Ipv4Subnets, &ipSubnet{
			SubnetIP: v4.Pool.String(),
			GwIP:     v4.Gateway.String(),
		})
	}
	for _, v6 := range ipamV6Data {
		config.Ipv6Subnets = append(config.Ipv6Subnets, &ipSubnet{
			SubnetIP: v6.Pool.String(),
			GwIP:     v6.Gateway.String(),
		})
	}
}
//go:build linux
package ipvlan
import (
"context"
"fmt"
"strconv"
"strings"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/vishvananda/netlink"
)
// Constants for dummy-parent naming and the minimum kernel version that
// supports ipvlan.
const (
	dummyPrefix     = "di-" // prefix for dummy ipvlan parent interfaces.
	dummyIDLength   = 12    // length for dummy parent interface IDs.
	ipvlanKernelVer = 4     // minimum ipvlan kernel support
	ipvlanMajorVer  = 2     // minimum ipvlan major kernel support
)
// createIPVlan Create the ipvlan slave specifying the source name.
// Returns the name of the created link.
func createIPVlan(containerIfName, parent, ipvlanMode, ipvlanFlag string) (string, error) {
	// Set the ipvlan mode and flag. Default is L2 bridge
	mode, err := setIPVlanMode(ipvlanMode)
	if err != nil {
		return "", fmt.Errorf("Unsupported %s ipvlan mode: %v", ipvlanMode, err)
	}
	// Set the ipvlan flag. Default is bridge
	flag, err := setIPVlanFlag(ipvlanFlag)
	if err != nil {
		return "", fmt.Errorf("Unsupported %s ipvlan flag: %v", ipvlanFlag, err)
	}
	// verify the Docker host interface acting as the ipvlan parent iface exists
	if !parentExists(parent) {
		return "", fmt.Errorf("the requested parent interface %s was not found on the Docker host", parent)
	}
	// Get the link for the master index (Example: the docker host eth iface)
	parentLink, err := ns.NlHandle().LinkByName(parent)
	if err != nil {
		return "", fmt.Errorf("error occurred looking up the ipvlan parent iface %s error: %s", parent, err)
	}
	// Create an ipvlan link
	ipvlan := &netlink.IPVlan{
		LinkAttrs: netlink.LinkAttrs{
			Name:        containerIfName,
			ParentIndex: parentLink.Attrs().Index,
		},
		Mode: mode,
		Flag: flag,
	}
	if err := ns.NlHandle().LinkAdd(ipvlan); err != nil {
		// If a user creates a macvlan and ipvlan on same parent, only one slave iface can be active at a time.
		return "", fmt.Errorf("failed to create the ipvlan port: %v", err)
	}
	return ipvlan.Attrs().Name, nil
}
// setIPVlanMode maps the user-supplied mode string to one of the three
// netlink ipvlan port types (l2, l3, l3s).
//
// Returns an error for any unrecognized mode. The error string is lowercase
// per Go convention, matching the sibling setIPVlanFlag (previously it was
// inconsistently capitalized as "Unknown ...").
func setIPVlanMode(mode string) (netlink.IPVlanMode, error) {
	switch mode {
	case modeL2:
		return netlink.IPVLAN_MODE_L2, nil
	case modeL3:
		return netlink.IPVLAN_MODE_L3, nil
	case modeL3S:
		return netlink.IPVLAN_MODE_L3S, nil
	default:
		return 0, fmt.Errorf("unknown ipvlan mode: %s", mode)
	}
}
// setIPVlanFlag maps the user-supplied flag string to one of the three
// netlink ipvlan port flags (bridge, private, vepa).
func setIPVlanFlag(flag string) (netlink.IPVlanFlag, error) {
	switch flag {
	case flagVepa:
		return netlink.IPVLAN_FLAG_VEPA, nil
	case flagPrivate:
		return netlink.IPVLAN_FLAG_PRIVATE, nil
	case flagBridge:
		return netlink.IPVLAN_FLAG_BRIDGE, nil
	}
	return 0, fmt.Errorf("unknown ipvlan flag: %s", flag)
}
// parentExists reports whether the named interface is present in the default
// network namespace.
func parentExists(ifaceStr string) bool {
	if _, err := ns.NlHandle().LinkByName(ifaceStr); err != nil {
		return false
	}
	return true
}
// createVlanLink parses sub-interfaces and vlan id for creation
func createVlanLink(parentName string) error {
	// Reject names that cannot be an iface.vlan_id pair up front.
	if !strings.Contains(parentName, ".") {
		return fmt.Errorf("invalid subinterface vlan name %s, example formatting is eth0.10", parentName)
	}
	master, vid, err := parseVlan(parentName)
	if err != nil {
		return err
	}
	// VLAN identifier or VID is a 12-bit field specifying the VLAN to which the frame belongs
	if vid < 1 || vid > 4094 {
		return fmt.Errorf("vlan id must be between 1-4094, received: %d", vid)
	}
	// Resolve the master link the tagged subinterface will be attached to.
	masterLink, err := ns.NlHandle().LinkByName(master)
	if err != nil {
		return fmt.Errorf("failed to find master interface %s on the Docker host: %v", master, err)
	}
	subIface := &netlink.Vlan{
		LinkAttrs: netlink.LinkAttrs{
			Name:        parentName,
			ParentIndex: masterLink.Attrs().Index,
		},
		VlanId: vid,
	}
	// Create the tagged subinterface, then bring it up.
	if err := ns.NlHandle().LinkAdd(subIface); err != nil {
		return fmt.Errorf("failed to create %s vlan link: %v", subIface.Name, err)
	}
	if err := ns.NlHandle().LinkSetUp(subIface); err != nil {
		return fmt.Errorf("failed to enable %s the ipvlan parent link %v", subIface.Name, err)
	}
	log.G(context.TODO()).Debugf("Added a vlan tagged netlink subinterface: %s with a vlan id: %d", parentName, vid)
	return nil
}
// delVlanLink verifies only sub-interfaces with a vlan id get deleted.
//
// Names without a '.' are left untouched (they may be user-created
// interfaces the driver does not own), and nil is returned.
func delVlanLink(linkName string) error {
	if strings.Contains(linkName, ".") {
		// Validate that the name parses as iface.vlan_id before touching it.
		_, _, err := parseVlan(linkName)
		if err != nil {
			return err
		}
		// delete the vlan subinterface
		vlanLink, err := ns.NlHandle().LinkByName(linkName)
		if err != nil {
			return fmt.Errorf("failed to find interface %s on the Docker host : %v", linkName, err)
		}
		// verify a parent interface isn't being deleted
		if vlanLink.Attrs().ParentIndex == 0 {
			// Fix: err is always nil at this point (LinkByName succeeded just
			// above), so the old message always appended a useless ": <nil>".
			return fmt.Errorf("interface %s does not appear to be a slave device", linkName)
		}
		// delete the ipvlan slave device
		if err := ns.NlHandle().LinkDel(vlanLink); err != nil {
			return fmt.Errorf("failed to delete %s link: %v", linkName, err)
		}
		log.G(context.TODO()).Debugf("Deleted a vlan tagged netlink subinterface: %s", linkName)
	}
	// if the subinterface doesn't parse to iface.vlan_id leave the interface in
	// place since it could be a user specified name not created by the driver.
	return nil
}
// parseVlan parses and verifies a slave interface name: -o parent=eth0.10
func parseVlan(linkName string) (string, int, error) {
	// Expect exactly one '.' separating the iface name from the vlan id.
	parts := strings.Split(linkName, ".")
	if len(parts) != 2 {
		return "", 0, fmt.Errorf("required interface name format is: name.vlan_id, ex. eth0.10 for vlan 10, instead received %s", linkName)
	}
	master := parts[0]
	// The vlan id portion must be a valid integer.
	vid, err := strconv.Atoi(parts[1])
	if err != nil {
		return "", 0, fmt.Errorf("unable to parse a valid vlan id from: %s (ex. eth0.10 for vlan 10)", parts[1])
	}
	// The master interface must already exist on the host.
	if !parentExists(master) {
		return "", 0, fmt.Errorf("-o parent interface was not found on the host: %s", master)
	}
	return master, vid, nil
}
// createDummyLink creates a dummy0 parent link
//
// dummyName is the name of the dummy interface to create and bring up.
// NOTE(review): truncNetID is never used in this body — confirm whether the
// second parameter can be dropped, or whether a caller relies on it.
func createDummyLink(dummyName, truncNetID string) error {
	// create a parent interface since one was not specified
	parent := &netlink.Dummy{
		LinkAttrs: netlink.LinkAttrs{
			Name: dummyName,
		},
	}
	if err := ns.NlHandle().LinkAdd(parent); err != nil {
		return err
	}
	// Re-query the new link by name to obtain a handle for LinkSetUp.
	parentDummyLink, err := ns.NlHandle().LinkByName(dummyName)
	if err != nil {
		return fmt.Errorf("error occurred looking up the ipvlan parent iface %s error: %s", dummyName, err)
	}
	// bring the new netlink iface up
	if err := ns.NlHandle().LinkSetUp(parentDummyLink); err != nil {
		return fmt.Errorf("failed to enable %s the ipvlan parent link: %v", dummyName, err)
	}
	return nil
}
// delDummyLink deletes the link type dummy used when -o parent is not passed
func delDummyLink(linkName string) error {
	// Resolve the link to delete.
	dummyLink, err := ns.NlHandle().LinkByName(linkName)
	if err != nil {
		return fmt.Errorf("failed to find link %s on the Docker host : %v", linkName, err)
	}
	// A dummy parent has no master; refuse to delete anything that looks
	// like a slave device.
	if dummyLink.Attrs().ParentIndex != 0 {
		return fmt.Errorf("link %s is not a parent dummy interface", linkName)
	}
	// Remove the driver-created dummy device.
	if err := ns.NlHandle().LinkDel(dummyLink); err != nil {
		return fmt.Errorf("failed to delete the dummy %s link: %v", linkName, err)
	}
	log.G(context.TODO()).Debugf("Deleted a dummy parent link: %s", linkName)
	return nil
}
// getDummyName returns the name of a dummy parent with truncated net ID and driver prefix
func getDummyName(netID string) string {
	trunc := netID
	if len(trunc) > dummyIDLength {
		trunc = trunc[:dummyIDLength]
	}
	return dummyPrefix + trunc
}
//go:build linux
package ipvlan
import (
"context"
"errors"
"fmt"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/types"
)
// network returns the network for nid, or nil (after logging an error) when
// it is not in the driver's table.
func (d *driver) network(nid string) *network {
	d.Lock()
	defer d.Unlock()
	n, ok := d.networks[nid]
	if !ok {
		log.G(context.TODO()).Errorf("network id %s not found", nid)
	}
	return n
}
// addNetwork registers n in the driver's network table.
func (d *driver) addNetwork(n *network) {
	d.Lock()
	defer d.Unlock()
	d.networks[n.id] = n
}
// deleteNetwork removes nid from the driver's network table.
func (d *driver) deleteNetwork(nid string) {
	d.Lock()
	defer d.Unlock()
	delete(d.networks, nid)
}
// getNetworks Safely returns a slice of existing networks
func (d *driver) getNetworks() []*network {
	d.Lock()
	defer d.Unlock()
	// Snapshot under the lock so callers may iterate without holding it.
	out := make([]*network, 0, len(d.networks))
	for _, n := range d.networks {
		out = append(out, n)
	}
	return out
}
// endpoint returns the endpoint for eid, or nil when absent.
func (n *network) endpoint(eid string) *endpoint {
	n.Lock()
	defer n.Unlock()
	ep := n.endpoints[eid]
	return ep
}
// addEndpoint registers ep in the network's endpoint table.
func (n *network) addEndpoint(ep *endpoint) {
	n.Lock()
	defer n.Unlock()
	n.endpoints[ep.id] = ep
}
// deleteEndpoint removes eid from the network's endpoint table.
func (n *network) deleteEndpoint(eid string) {
	n.Lock()
	defer n.Unlock()
	delete(n.endpoints, eid)
}
// getEndpoint looks up an endpoint by id under the network lock.
//
// Contract quirks preserved deliberately (callers nil-check the endpoint in
// addition to the error):
//   - an empty eid returns an error;
//   - a non-empty eid that is not in the table returns (nil, nil), NOT an
//     error.
func (n *network) getEndpoint(eid string) (*endpoint, error) {
	n.Lock()
	defer n.Unlock()
	if eid == "" {
		return nil, fmt.Errorf("endpoint id %s not found", eid)
	}
	if ep, ok := n.endpoints[eid]; ok {
		return ep, nil
	}
	// Not found: deliberately no error — see the doc comment above.
	return nil, nil
}
// validateID checks that both the network and endpoint ids are non-empty.
func validateID(nid, eid string) error {
	switch {
	case nid == "":
		return errors.New("invalid network id")
	case eid == "":
		return errors.New("invalid endpoint id")
	default:
		return nil
	}
}
// getNetwork returns the network for id, or a typed error when the id is
// empty or unknown.
func (d *driver) getNetwork(id string) (*network, error) {
	d.Lock()
	defer d.Unlock()
	if id == "" {
		return nil, types.InvalidParameterErrorf("invalid network id: %s", id)
	}
	nw, ok := d.networks[id]
	if !ok {
		return nil, types.NotFoundErrorf("network not found: %s", id)
	}
	return nw, nil
}
//go:build linux
package ipvlan
import (
"context"
"encoding/json"
"errors"
"fmt"
"net"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/datastore"
"github.com/docker/docker/daemon/libnetwork/types"
)
// Datastore key prefixes under which ipvlan driver state is persisted.
const (
	ipvlanPrefix = "ipvlan" // common root for all ipvlan keys
	ipvlanNetworkPrefix = ipvlanPrefix + "/network" // network (configuration) records
	ipvlanEndpointPrefix = ipvlanPrefix + "/endpoint" // endpoint records
)
// networkConfiguration for this driver's network specific configuration
//
// The struct is persisted to the datastore via its KVObject methods; field
// names therefore form part of the stored JSON (see MarshalJSON).
type configuration struct {
	ID string // network id
	Mtu int // MTU for the network's interfaces
	dbIndex uint64 // datastore index, managed by SetIndex
	dbExists bool // set once the record exists in the store
	Internal bool // network has no external connectivity (-internal, or no parent)
	Parent string // parent interface name (-o parent)
	IpvlanMode string // -o ipvlan_mode (l2/l3/l3s)
	IpvlanFlag string // -o ipvlan_flag (bridge/private/vepa)
	CreatedSlaveLink bool // true when this driver created the parent link
	Ipv4Subnets []*ipSubnet // v4 subnets bound by processIPAM
	Ipv6Subnets []*ipSubnet // v6 subnets bound by processIPAM
}
// ipSubnet is the serialized form of one subnet: a CIDR pool plus its
// gateway address, both stored as strings.
type ipSubnet struct {
	SubnetIP string
	GwIP string
}
// initStore drivers are responsible for caching their own persistent state
func (d *driver) initStore() error {
	// Restore networks first so endpoints can be re-attached to them.
	if err := d.populateNetworks(); err != nil {
		return err
	}
	return d.populateEndpoints()
}
// populateNetworks is invoked at driver init to recreate persistently stored networks
func (d *driver) populateNetworks() error {
	kvol, err := d.store.List(&configuration{})
	switch {
	case errors.Is(err, datastore.ErrKeyNotFound):
		// No ipvlan networks have been created yet.
		return nil
	case err != nil:
		return fmt.Errorf("failed to get ipvlan network configurations from store: %w", err)
	}
	for _, kvo := range kvol {
		cfg := kvo.(*configuration)
		if _, err := d.createNetwork(cfg); err != nil {
			log.G(context.TODO()).Warnf("could not create ipvlan network for id %s from persistent state", cfg.ID)
		}
	}
	return nil
}
// populateEndpoints restores persisted ipvlan endpoints at driver init and
// re-attaches each to its network; endpoints whose network no longer exists
// are deleted from the store as stale.
//
// NOTE(review): d.networks and n.endpoints are accessed here without taking
// the driver/network locks — presumably safe because this runs only during
// single-threaded driver initialization; confirm before calling elsewhere.
func (d *driver) populateEndpoints() error {
	kvol, err := d.store.List(&endpoint{})
	if err != nil && !errors.Is(err, datastore.ErrKeyNotFound) {
		return fmt.Errorf("failed to get ipvlan endpoints from store: %w", err)
	}
	// ErrKeyNotFound simply means no endpoints have been persisted yet.
	if errors.Is(err, datastore.ErrKeyNotFound) {
		return nil
	}
	for _, kvo := range kvol {
		ep := kvo.(*endpoint)
		n, ok := d.networks[ep.nid]
		if !ok {
			// Owning network is gone: remove the orphaned endpoint record.
			log.G(context.TODO()).Debugf("Network (%.7s) not found for restored ipvlan endpoint (%.7s)", ep.nid, ep.id)
			log.G(context.TODO()).Debugf("Deleting stale ipvlan endpoint (%.7s) from store", ep.id)
			if err := d.storeDelete(ep); err != nil {
				log.G(context.TODO()).Debugf("Failed to delete stale ipvlan endpoint (%.7s) from store", ep.id)
			}
			continue
		}
		n.endpoints[ep.id] = ep
		log.G(context.TODO()).Debugf("Endpoint (%.7s) restored to network (%.7s)", ep.id, ep.nid)
	}
	return nil
}
// storeUpdate used to update persistent ipvlan network records as they are created
func (d *driver) storeUpdate(kvObject datastore.KVObject) error {
	// Without a backing store, persisting is a warned no-op.
	if d.store == nil {
		log.G(context.TODO()).Warnf("ipvlan store not initialized. kv object %s is not added to the store", datastore.Key(kvObject.Key()...))
		return nil
	}
	err := d.store.PutObjectAtomic(kvObject)
	if err != nil {
		return fmt.Errorf("failed to update ipvlan store for object type %T: %v", kvObject, err)
	}
	return nil
}
// storeDelete used to delete ipvlan network records from persistent cache as they are deleted
func (d *driver) storeDelete(kvObject datastore.KVObject) error {
	// Without a backing store, deletion is a logged no-op.
	if d.store != nil {
		return d.store.DeleteObject(kvObject)
	}
	log.G(context.TODO()).Debugf("ipvlan store not initialized. kv object %s is not deleted from store", datastore.Key(kvObject.Key()...))
	return nil
}
// MarshalJSON encodes the configuration as a flat map; the subnet slices are
// nested as JSON strings ("Ipv4Subnets"/"Ipv6Subnets") and omitted when
// empty, matching the format UnmarshalJSON expects.
func (config *configuration) MarshalJSON() ([]byte, error) {
	out := map[string]interface{}{
		"ID":              config.ID,
		"Mtu":             config.Mtu,
		"Parent":          config.Parent,
		"IpvlanMode":      config.IpvlanMode,
		"IpvlanFlag":      config.IpvlanFlag,
		"Internal":        config.Internal,
		"CreatedSubIface": config.CreatedSlaveLink,
	}
	if len(config.Ipv4Subnets) > 0 {
		b, err := json.Marshal(config.Ipv4Subnets)
		if err != nil {
			return nil, err
		}
		out["Ipv4Subnets"] = string(b)
	}
	if len(config.Ipv6Subnets) > 0 {
		b, err := json.Marshal(config.Ipv6Subnets)
		if err != nil {
			return nil, err
		}
		out["Ipv6Subnets"] = string(b)
	}
	return json.Marshal(out)
}
// UnmarshalJSON decodes a configuration previously written by MarshalJSON.
//
// NOTE(review): the direct type assertions (e.g. nMap["ID"].(string)) panic
// if a key is missing or of an unexpected type — this assumes the input was
// produced by MarshalJSON; confirm before feeding it foreign data. The
// "IpvlanFlag" key is the one deliberate exception, with a migration default.
func (config *configuration) UnmarshalJSON(b []byte) error {
	var (
		err error
		nMap map[string]interface{}
	)
	if err = json.Unmarshal(b, &nMap); err != nil {
		return err
	}
	config.ID = nMap["ID"].(string)
	config.Mtu = int(nMap["Mtu"].(float64))
	config.Parent = nMap["Parent"].(string)
	config.IpvlanMode = nMap["IpvlanMode"].(string)
	if v, ok := nMap["IpvlanFlag"]; ok {
		config.IpvlanFlag = v.(string)
	} else {
		// Migrate config from an older daemon which did not have the flag configurable.
		config.IpvlanFlag = flagBridge
	}
	config.Internal = nMap["Internal"].(bool)
	config.CreatedSlaveLink = nMap["CreatedSubIface"].(bool)
	// Subnet slices are stored as nested JSON strings — see MarshalJSON.
	if v, ok := nMap["Ipv4Subnets"]; ok {
		if err := json.Unmarshal([]byte(v.(string)), &config.Ipv4Subnets); err != nil {
			return err
		}
	}
	if v, ok := nMap["Ipv6Subnets"]; ok {
		if err := json.Unmarshal([]byte(v.(string)), &config.Ipv6Subnets); err != nil {
			return err
		}
	}
	return nil
}
// Key returns the fully-qualified datastore key for this network record.
func (config *configuration) Key() []string {
	return []string{ipvlanNetworkPrefix, config.ID}
}
// KeyPrefix returns the datastore prefix shared by all ipvlan network records.
func (config *configuration) KeyPrefix() []string {
	return []string{ipvlanNetworkPrefix}
}
// Value serializes the configuration; returns nil when marshaling fails.
func (config *configuration) Value() []byte {
	b, err := json.Marshal(config)
	if err != nil {
		return nil
	}
	return b
}
// SetValue deserializes a stored record into the receiver.
func (config *configuration) SetValue(value []byte) error {
	return json.Unmarshal(value, config)
}
// Index returns the current datastore index of this record.
func (config *configuration) Index() uint64 {
	return config.dbIndex
}
// SetIndex records the datastore index and marks the record as existing.
func (config *configuration) SetIndex(index uint64) {
	config.dbIndex = index
	config.dbExists = true
}
// Exists reports whether the record is present in the datastore.
func (config *configuration) Exists() bool {
	return config.dbExists
}
// Skip reports whether persistence should be skipped for this object (never).
func (config *configuration) Skip() bool {
	return false
}
// New returns an empty configuration for the datastore to decode into.
func (config *configuration) New() datastore.KVObject {
	return &configuration{}
}
// CopyTo shallow-copies the configuration into another KVObject.
func (config *configuration) CopyTo(o datastore.KVObject) error {
	dstNcfg := o.(*configuration)
	*dstNcfg = *config
	return nil
}
// MarshalJSON encodes the endpoint's persisted fields; the MAC and the
// v4/v6 addresses are included only when set.
func (ep *endpoint) MarshalJSON() ([]byte, error) {
	out := map[string]interface{}{
		"id":      ep.id,
		"nid":     ep.nid,
		"SrcName": ep.srcName,
	}
	if len(ep.mac) != 0 {
		out["MacAddress"] = ep.mac.String()
	}
	if ep.addr != nil {
		out["Addr"] = ep.addr.String()
	}
	if ep.addrv6 != nil {
		out["Addrv6"] = ep.addrv6.String()
	}
	return json.Marshal(out)
}
// UnmarshalJSON decodes an endpoint previously written by MarshalJSON.
//
// Optional keys (MacAddress/Addr/Addrv6) are decoded only when present;
// NOTE(review): the trailing assertions on "id"/"nid"/"SrcName" panic if
// those keys are missing — assumed safe for MarshalJSON-produced input.
func (ep *endpoint) UnmarshalJSON(b []byte) error {
	var (
		err error
		epMap map[string]interface{}
	)
	if err = json.Unmarshal(b, &epMap); err != nil {
		return fmt.Errorf("Failed to unmarshal to ipvlan endpoint: %v", err)
	}
	if v, ok := epMap["MacAddress"]; ok {
		if ep.mac, err = net.ParseMAC(v.(string)); err != nil {
			return types.InternalErrorf("failed to decode ipvlan endpoint MAC address (%s) after json unmarshal: %v", v.(string), err)
		}
	}
	if v, ok := epMap["Addr"]; ok {
		if ep.addr, err = types.ParseCIDR(v.(string)); err != nil {
			return types.InternalErrorf("failed to decode ipvlan endpoint IPv4 address (%s) after json unmarshal: %v", v.(string), err)
		}
	}
	if v, ok := epMap["Addrv6"]; ok {
		if ep.addrv6, err = types.ParseCIDR(v.(string)); err != nil {
			return types.InternalErrorf("failed to decode ipvlan endpoint IPv6 address (%s) after json unmarshal: %v", v.(string), err)
		}
	}
	ep.id = epMap["id"].(string)
	ep.nid = epMap["nid"].(string)
	ep.srcName = epMap["SrcName"].(string)
	return nil
}
// Key returns the fully-qualified datastore key for this endpoint record.
func (ep *endpoint) Key() []string {
	return []string{ipvlanEndpointPrefix, ep.id}
}
// KeyPrefix returns the datastore prefix shared by all ipvlan endpoint records.
func (ep *endpoint) KeyPrefix() []string {
	return []string{ipvlanEndpointPrefix}
}
// Value serializes the endpoint; returns nil when marshaling fails.
func (ep *endpoint) Value() []byte {
	b, err := json.Marshal(ep)
	if err != nil {
		return nil
	}
	return b
}
// SetValue deserializes a stored record into the receiver.
func (ep *endpoint) SetValue(value []byte) error {
	return json.Unmarshal(value, ep)
}
// Index returns the current datastore index of this record.
func (ep *endpoint) Index() uint64 {
	return ep.dbIndex
}
// SetIndex records the datastore index and marks the record as existing.
func (ep *endpoint) SetIndex(index uint64) {
	ep.dbIndex = index
	ep.dbExists = true
}
// Exists reports whether the record is present in the datastore.
func (ep *endpoint) Exists() bool {
	return ep.dbExists
}
// Skip reports whether persistence should be skipped for this object (never).
func (ep *endpoint) Skip() bool {
	return false
}
// New returns an empty endpoint for the datastore to decode into.
func (ep *endpoint) New() datastore.KVObject {
	return &endpoint{}
}
// CopyTo shallow-copies the endpoint into another KVObject.
func (ep *endpoint) CopyTo(o datastore.KVObject) error {
	dstEp := o.(*endpoint)
	*dstEp = *ep
	return nil
}
//go:build linux
package macvlan
import (
"net"
"sync"
"github.com/docker/docker/daemon/libnetwork/datastore"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/scope"
"github.com/docker/docker/daemon/libnetwork/types"
)
// Interface-naming conventions and user-facing option keys for the macvlan
// driver.
const (
	containerVethPrefix = "eth" // prefix of the iface name inside the sandbox
	vethPrefix = "veth" // prefix for generated host-side iface names
	vethLen = len(vethPrefix) + 7 // total generated iface name length
	NetworkType = "macvlan" // driver type name
	modePrivate = "private" // macvlan mode private
	modeVepa = "vepa" // macvlan mode vepa
	modeBridge = "bridge" // macvlan mode bridge
	modePassthru = "passthru" // macvlan mode passthrough
	parentOpt = "parent" // parent interface -o parent
	driverModeOpt = "macvlan_mode" // macvlan mode ux opt suffix
)
// endpointTable maps endpoint ids to endpoints.
type endpointTable map[string]*endpoint
// networkTable maps network ids to networks.
type networkTable map[string]*network
// driver implements the libnetwork driver API for the macvlan network type.
type driver struct {
	networks networkTable // guarded by the embedded Mutex
	sync.Once // NOTE(review): embedded but unused in the code visible here — confirm
	sync.Mutex
	store *datastore.Store // persistent cache; may be nil (see storeUpdate/storeDelete)
}
// endpoint holds the per-container state persisted by this driver.
type endpoint struct {
	id string // endpoint id
	nid string // owning network id
	mac net.HardwareAddr
	addr *net.IPNet // v4 address, may be nil
	addrv6 *net.IPNet // v6 address, may be nil
	srcName string // host-side iface name bound at Join time
	dbIndex uint64 // datastore index
	dbExists bool // set once persisted
}
// network pairs a network id with its endpoints and configuration.
type network struct {
	id string
	endpoints endpointTable // guarded by the embedded Mutex
	driver *driver
	config *configuration
	sync.Mutex
}
// Register initializes and registers the libnetwork macvlan driver
func Register(r driverapi.Registerer, store *datastore.Store, _ map[string]interface{}) error {
	d := &driver{
		networks: networkTable{},
		store:    store,
	}
	// Restore any persisted networks and endpoints before registering.
	if err := d.initStore(); err != nil {
		return err
	}
	caps := driverapi.Capability{
		DataScope:         scope.Local,
		ConnectivityScope: scope.Global,
	}
	return r.RegisterDriver(NetworkType, d, caps)
}
// NetworkAllocate is not supported by the macvlan driver.
func (d *driver) NetworkAllocate(id string, option map[string]string, ipV4Data, ipV6Data []driverapi.IPAMData) (map[string]string, error) {
	return nil, types.NotImplementedErrorf("not implemented")
}
// NetworkFree is not supported by the macvlan driver.
func (d *driver) NetworkFree(id string) error {
	return types.NotImplementedErrorf("not implemented")
}
// EndpointOperInfo returns an empty map: no operational data is exposed.
func (d *driver) EndpointOperInfo(nid, eid string) (map[string]interface{}, error) {
	return make(map[string]interface{}), nil
}
// Type returns the driver type name ("macvlan").
func (d *driver) Type() string {
	return NetworkType
}
// IsBuiltIn reports that this driver is built into the daemon.
func (d *driver) IsBuiltIn() bool {
	return true
}
// EventNotify is a no-op for this driver.
func (d *driver) EventNotify(etype driverapi.EventType, nid, tableName, key string, value []byte) {
}
// DecodeTableEntry returns empty values: this driver publishes no table entries here.
func (d *driver) DecodeTableEntry(tablename string, key string, value []byte) (string, map[string]string) {
	return "", nil
}
//go:build linux
package macvlan
import (
"context"
"fmt"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/netutils"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/errdefs"
)
// CreateEndpoint assigns the mac, ip and endpoint id for the new container.
//
// The endpoint is persisted to the store before being added to the network's
// in-memory table. Port mappings and exposures are not supported by macvlan;
// they are warned about rather than rejected.
func (d *driver) CreateEndpoint(ctx context.Context, nid, eid string, ifInfo driverapi.InterfaceInfo, epOptions map[string]interface{}) error {
	if err := validateID(nid, eid); err != nil {
		return err
	}
	n, err := d.getNetwork(nid)
	if err != nil {
		return errdefs.System(fmt.Errorf("network id %q not found", nid))
	}
	ep := &endpoint{
		id:     eid,
		nid:    nid,
		addr:   ifInfo.Address(),
		addrv6: ifInfo.AddressIPv6(),
		mac:    ifInfo.MacAddress(),
	}
	// Generate a random MAC when the caller did not supply one.
	if ep.mac == nil {
		ep.mac = netutils.GenerateRandomMAC()
		if err := ifInfo.SetMacAddress(ep.mac); err != nil {
			return err
		}
	}
	// disallow portmapping -p
	// Fix: assert the option type once with comma-ok instead of asserting it
	// three times (check, re-check, and use).
	if opt, ok := epOptions[netlabel.PortMap]; ok {
		if pbs, ok := opt.([]types.PortBinding); ok && len(pbs) > 0 {
			log.G(ctx).Warnf("macvlan driver does not support port mappings")
		}
	}
	// disallow port exposure --expose
	if opt, ok := epOptions[netlabel.ExposedPorts]; ok {
		if tps, ok := opt.([]types.TransportPort); ok && len(tps) > 0 {
			log.G(ctx).Warnf("macvlan driver does not support port exposures")
		}
	}
	// Persist first; only publish to the in-memory table on success.
	if err := d.storeUpdate(ep); err != nil {
		return fmt.Errorf("failed to save macvlan endpoint %.7s to store: %v", ep.id, err)
	}
	n.addEndpoint(ep)
	return nil
}
// DeleteEndpoint removes the endpoint and associated netlink interface
func (d *driver) DeleteEndpoint(nid, eid string) error {
	if err := validateID(nid, eid); err != nil {
		return err
	}
	n := d.network(nid)
	if n == nil {
		return fmt.Errorf("network id %q not found", nid)
	}
	ep := n.endpoint(eid)
	if ep == nil {
		return fmt.Errorf("endpoint id %q not found", eid)
	}
	// Best-effort removal of the host-side link; a failure is only logged.
	if link, err := ns.NlHandle().LinkByName(ep.srcName); err == nil {
		if err := ns.NlHandle().LinkDel(link); err != nil {
			log.G(context.TODO()).WithError(err).Warnf("Failed to delete interface (%s)'s link on endpoint (%s) delete", ep.srcName, ep.id)
		}
	}
	// Best-effort removal from the persistent store as well.
	if err := d.storeDelete(ep); err != nil {
		log.G(context.TODO()).Warnf("Failed to remove macvlan endpoint %.7s from store: %v", ep.id, err)
	}
	n.deleteEndpoint(ep.id)
	return nil
}
//go:build linux
package macvlan
import (
"context"
"fmt"
"net"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/netutils"
"github.com/docker/docker/daemon/libnetwork/ns"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
)
// Join method is invoked when a Sandbox is attached to an endpoint.
//
// It creates the macvlan slave interface on the network's parent link, binds
// the generated interface name to the endpoint, configures the v4/v6
// gateways from the network's subnet records (unless the network is
// internal), and persists the updated endpoint.
func (d *driver) Join(ctx context.Context, nid, eid string, sboxKey string, jinfo driverapi.JoinInfo, epOpts, _ map[string]interface{}) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.drivers.macvlan.Join", trace.WithAttributes(
		attribute.String("nid", nid),
		attribute.String("eid", eid),
		attribute.String("sboxKey", sboxKey)))
	defer span.End()
	n, err := d.getNetwork(nid)
	if err != nil {
		return err
	}
	// Fix: the endpoint was previously looked up twice (once as `endpoint`,
	// again as `ep`) with duplicate nil checks; both lookups returned the
	// same pointer, so a single lookup suffices.
	ep := n.endpoint(eid)
	if ep == nil {
		return fmt.Errorf("could not find endpoint with id %s", eid)
	}
	// generate a name for the iface that will be renamed to eth0 in the sbox
	containerIfName, err := netutils.GenerateIfaceName(ns.NlHandle(), vethPrefix, vethLen)
	if err != nil {
		return fmt.Errorf("error generating an interface name: %s", err)
	}
	// create the netlink macvlan interface
	vethName, err := createMacVlan(containerIfName, n.config.Parent, n.config.MacvlanMode)
	if err != nil {
		return err
	}
	// bind the generated iface name to the endpoint
	ep.srcName = vethName
	// parse and match the endpoint address with the available v4 subnets
	if !n.config.Internal {
		if len(n.config.Ipv4Subnets) > 0 {
			s := n.getSubnetforIPv4(ep.addr)
			if s == nil {
				return fmt.Errorf("could not find a valid ipv4 subnet for endpoint %s", eid)
			}
			v4gw, _, err := net.ParseCIDR(s.GwIP)
			if err != nil {
				return fmt.Errorf("gateway %s is not a valid ipv4 address: %v", s.GwIP, err)
			}
			err = jinfo.SetGateway(v4gw)
			if err != nil {
				return err
			}
			log.G(ctx).Debugf("Macvlan Endpoint Joined with IPv4_Addr: %s, Gateway: %s, MacVlan_Mode: %s, Parent: %s",
				ep.addr.IP.String(), v4gw.String(), n.config.MacvlanMode, n.config.Parent)
		}
		// parse and match the endpoint address with the available v6 subnets
		if ep.addrv6 != nil && len(n.config.Ipv6Subnets) > 0 {
			s := n.getSubnetforIPv6(ep.addrv6)
			if s == nil {
				return fmt.Errorf("could not find a valid ipv6 subnet for endpoint %s", eid)
			}
			v6gw, _, err := net.ParseCIDR(s.GwIP)
			if err != nil {
				return fmt.Errorf("gateway %s is not a valid ipv6 address: %v", s.GwIP, err)
			}
			err = jinfo.SetGatewayIPv6(v6gw)
			if err != nil {
				return err
			}
			log.G(ctx).Debugf("Macvlan Endpoint Joined with IPv6_Addr: %s Gateway: %s MacVlan_Mode: %s, Parent: %s",
				ep.addrv6.IP.String(), v6gw.String(), n.config.MacvlanMode, n.config.Parent)
		}
		if len(n.config.Ipv4Subnets) == 0 && len(n.config.Ipv6Subnets) == 0 {
			// With no addresses, don't need a gateway.
			jinfo.DisableGatewayService()
		}
	} else {
		if len(n.config.Ipv4Subnets) > 0 {
			log.G(ctx).Debugf("Macvlan Endpoint Joined with IPv4_Addr: %s, MacVlan_Mode: %s, Parent: %s",
				ep.addr.IP.String(), n.config.MacvlanMode, n.config.Parent)
		}
		if len(n.config.Ipv6Subnets) > 0 {
			log.G(ctx).Debugf("Macvlan Endpoint Joined with IPv6_Addr: %s MacVlan_Mode: %s, Parent: %s",
				ep.addrv6.IP.String(), n.config.MacvlanMode, n.config.Parent)
		}
		// If n.config.Internal was set locally by the driver because there's no parent
		// interface, libnetwork doesn't know the network is internal. So, stop it from
		// adding a gateway endpoint.
		jinfo.DisableGatewayService()
	}
	iNames := jinfo.InterfaceName()
	err = iNames.SetNames(vethName, containerVethPrefix, netlabel.GetIfname(epOpts))
	if err != nil {
		return err
	}
	// Persist the endpoint now that srcName is bound.
	if err := d.storeUpdate(ep); err != nil {
		return fmt.Errorf("failed to save macvlan endpoint %.7s to store: %v", ep.id, err)
	}
	return nil
}
// Leave method is invoked when a Sandbox detaches from an endpoint.
func (d *driver) Leave(nid, eid string) error {
	nw, err := d.getNetwork(nid)
	if err != nil {
		return err
	}
	// getEndpoint may return (nil, nil) for an unknown id; check both.
	ep, err := nw.getEndpoint(eid)
	if err != nil {
		return err
	}
	if ep == nil {
		return fmt.Errorf("could not find endpoint with id %s", eid)
	}
	return nil
}
// getSubnetforIPv4 returns the ipv4 subnet to which the given IP belongs
func (n *network) getSubnetforIPv4(ip *net.IPNet) *ipSubnet {
	return getSubnetForIP(ip, n.config.Ipv4Subnets)
}
// getSubnetforIPv6 returns the ipv6 subnet to which the given IP belongs
func (n *network) getSubnetforIPv6(ip *net.IPNet) *ipSubnet {
	return getSubnetForIP(ip, n.config.Ipv6Subnets)
}
// getSubnetForIP returns the subnet from subnets that contains ip and has
// the same prefix length, or nil when none matches.
func getSubnetForIP(ip *net.IPNet, subnets []*ipSubnet) *ipSubnet {
	for _, s := range subnets {
		_, snet, err := net.ParseCIDR(s.SubnetIP)
		if err != nil {
			// Fix: skip an unparsable entry instead of aborting the scan —
			// previously a single malformed subnet (`return nil`) hid all
			// later valid matches.
			continue
		}
		// first check if the mask lengths are the same
		i, _ := snet.Mask.Size()
		j, _ := ip.Mask.Size()
		if i != j {
			continue
		}
		if snet.Contains(ip.IP) {
			return s
		}
	}
	return nil
}
//go:build linux
package macvlan
import (
"context"
"errors"
"fmt"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/docker/docker/daemon/libnetwork/options"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/errdefs"
)
// CreateNetwork the network for the specified driver type.
//
// Validates the IPAM pools, parses the driver options, falls back to a
// driver-named dummy parent when none was given, creates the network, and
// persists it — rolling the in-memory network back if persisting fails.
func (d *driver) CreateNetwork(ctx context.Context, nid string, option map[string]interface{}, nInfo driverapi.NetworkInfo, ipV4Data, ipV6Data []driverapi.IPAMData) error {
	// reject a null v4 network if ipv4 is required
	if v, ok := option[netlabel.EnableIPv4]; ok && v.(bool) {
		if len(ipV4Data) == 0 || ipV4Data[0].Pool.String() == "0.0.0.0/0" {
			return errdefs.InvalidParameter(errors.New("ipv4 pool is empty"))
		}
	}
	// reject a null v6 network if ipv6 is required
	if v, ok := option[netlabel.EnableIPv6]; ok && v.(bool) {
		if len(ipV6Data) == 0 || ipV6Data[0].Pool.String() == "::/0" {
			return errdefs.InvalidParameter(errors.New("ipv6 pool is empty"))
		}
	}
	// parse and validate the config and bind to networkConfiguration
	config, err := parseNetworkOptions(nid, option)
	if err != nil {
		return err
	}
	config.processIPAM(ipV4Data, ipV6Data)
	// if parent interface not specified, create a dummy type link to use named dummy+net_id
	if config.Parent == "" {
		config.Parent = getDummyName(config.ID)
	}
	foundExisting, err := d.createNetwork(config)
	if err != nil {
		return err
	}
	// A restored network is already persisted; signal via a maskable error.
	if foundExisting {
		return types.InternalMaskableErrorf("restoring existing network %s", config.ID)
	}
	// update persistent db, rollback on fail
	err = d.storeUpdate(config)
	if err != nil {
		d.deleteNetwork(config.ID)
		log.G(context.TODO()).Debugf("encountered an error rolling back a network create for %s : %v", config.ID, err)
		return err
	}
	return nil
}
// GetSkipGwAlloc reports, for both address families, whether gateway
// allocation can be skipped for a network created with opts:
// "--internal" networks don't need a gateway.
func (d *driver) GetSkipGwAlloc(opts options.Generic) (ipv4, ipv6 bool, _ error) {
	cfg, err := parseNetworkOptions("dummy", opts)
	if err != nil {
		return false, false, err
	}
	skip := cfg.Internal
	return skip, skip, nil
}
// createNetwork is used by new network callbacks and persistent network cache.
//
// It enforces the passthru-mode exclusivity rules on the parent interface,
// creates the parent link (dummy or vlan subinterface) when it does not
// already exist, and registers the network unless an existing one with the
// same ID was found (returned as foundExisting=true).
func (d *driver) createNetwork(config *configuration) (bool, error) {
	foundExisting := false
	networkList := d.getNetworks()
	for _, nw := range networkList {
		if config.Parent == nw.config.Parent {
			if config.ID != nw.config.ID {
				// passthru mode requires exclusive use of the parent iface,
				// in either direction.
				if config.MacvlanMode == modePassthru {
					return false, fmt.Errorf(
						"cannot use mode passthru, macvlan network %s is already using parent interface %s",
						nw.config.ID,
						config.Parent,
					)
				} else if nw.config.MacvlanMode == modePassthru {
					return false, fmt.Errorf(
						"macvlan network %s is already using parent interface %s in mode passthru",
						nw.config.ID,
						config.Parent,
					)
				}
				continue
			}
			// Same ID: this is a restore of an already-known network.
			log.G(context.TODO()).Debugf("Create Network for the same ID %s\n", config.ID)
			foundExisting = true
			break
		}
	}
	if !parentExists(config.Parent) {
		// Create a dummy link if a dummy name is set for parent
		if dummyName := getDummyName(config.ID); dummyName == config.Parent {
			// NOTE(review): both arguments are the same value here
			// (dummyName == config.Parent) — confirm createDummyLink's second
			// parameter is actually needed.
			err := createDummyLink(config.Parent, dummyName)
			if err != nil {
				return false, err
			}
			config.CreatedSlaveLink = true
			// notify the user in logs that they have limited communications
			log.G(context.TODO()).Debugf("Empty -o parent= flags limit communications to other containers inside of network: %s",
				config.Parent)
		} else {
			// if the subinterface parent_iface.vlan_id checks do not pass, return err.
			// a valid example is 'eth0.10' for a parent iface 'eth0' with a vlan id '10'
			err := createVlanLink(config.Parent)
			if err != nil {
				return false, err
			}
			// if driver created the networks slave link, record it for future deletion
			config.CreatedSlaveLink = true
		}
	} else {
		// Check and mark this network if the interface was created for another network
		for _, testN := range d.getNetworks() {
			if config.Parent == testN.config.Parent && testN.config.CreatedSlaveLink {
				config.CreatedSlaveLink = true
				break
			}
		}
	}
	if !foundExisting {
		d.addNetwork(&network{
			id: config.ID,
			driver: d,
			endpoints: endpointTable{},
			config: config,
		})
	}
	return foundExisting, nil
}
// parentHasSingleUser reports whether n is the only network using its parent
// interface.
func (d *driver) parentHasSingleUser(n *network) bool {
	count := 0
	for _, other := range d.getNetworks() {
		if other.config.Parent == n.config.Parent {
			count++
		}
	}
	return count == 1
}
// DeleteNetwork deletes the network for the specified driver type.
//
// Tears down the driver-created parent link (only when this network is its
// last user), best-effort deletes each endpoint's host link and store
// record, then removes the network from memory and from the persistent
// cache.
//
// NOTE(review): n.endpoints is iterated here without taking the network
// lock — confirm no concurrent endpoint mutation can occur during network
// delete.
func (d *driver) DeleteNetwork(nid string) error {
	n := d.network(nid)
	if n == nil {
		return fmt.Errorf("network id %s not found", nid)
	}
	// if the driver created the slave interface and this network is the last user, delete it, otherwise leave it
	if n.config.CreatedSlaveLink && parentExists(n.config.Parent) && d.parentHasSingleUser(n) {
		// only delete the link if it is named the net_id
		if n.config.Parent == getDummyName(nid) {
			err := delDummyLink(n.config.Parent)
			if err != nil {
				// Best-effort: log and carry on with the network delete.
				log.G(context.TODO()).Debugf("link %s was not deleted, continuing the delete network operation: %v",
					n.config.Parent, err)
			}
		} else {
			// only delete the link if it matches iface.vlan naming
			err := delVlanLink(n.config.Parent)
			if err != nil {
				log.G(context.TODO()).Debugf("link %s was not deleted, continuing the delete network operation: %v",
					n.config.Parent, err)
			}
		}
	}
	for _, ep := range n.endpoints {
		// Best-effort removal of the endpoint's host-side link and record.
		if link, err := ns.NlHandle().LinkByName(ep.srcName); err == nil {
			if err := ns.NlHandle().LinkDel(link); err != nil {
				log.G(context.TODO()).WithError(err).Warnf("Failed to delete interface (%s)'s link on endpoint (%s) delete", ep.srcName, ep.id)
			}
		}
		if err := d.storeDelete(ep); err != nil {
			log.G(context.TODO()).Warnf("Failed to remove macvlan endpoint %.7s from store: %v", ep.id, err)
		}
	}
	// delete the *network
	d.deleteNetwork(nid)
	// delete the network record from persistent cache
	err := d.storeDelete(n.config)
	if err != nil {
		return fmt.Errorf("error deleting id %s from datastore: %v", nid, err)
	}
	return nil
}
// parseNetworkOptions parses docker network options
func parseNetworkOptions(id string, option options.Generic) (*configuration, error) {
	config := &configuration{}
	// generic labels are parsed first; later options refine the result
	if genData, ok := option[netlabel.GenericData]; ok && genData != nil {
		var err error
		if config, err = parseNetworkGenericOptions(genData); err != nil {
			return nil, err
		}
	}
	if val, ok := option[netlabel.Internal]; ok {
		if internal, ok := val.(bool); ok && internal {
			config.Internal = true
		}
	}
	// verify the macvlan mode from -o macvlan_mode option
	switch config.MacvlanMode {
	case "":
		// an empty -o macvlan_mode defaults to bridge mode
		config.MacvlanMode = modeBridge
	case modeBridge, modePrivate, modePassthru, modeVepa:
		// valid option
	default:
		return nil, fmt.Errorf("requested macvlan mode '%s' is not valid, 'bridge' mode is the macvlan driver default", config.MacvlanMode)
	}
	// loopback is not a valid parent link
	if config.Parent == "lo" {
		return nil, errors.New("loopback interface is not a valid macvlan parent link")
	}
	// With no parent interface, the network is "internal".
	if config.Parent == "" {
		config.Internal = true
	}
	config.ID = id
	return config, nil
}
// parseNetworkGenericOptions parses generic driver docker network options
func parseNetworkGenericOptions(data interface{}) (*configuration, error) {
	if cfg, ok := data.(*configuration); ok {
		// already in the driver's native form
		return cfg, nil
	}
	if labels, ok := data.(map[string]string); ok {
		return newConfigFromLabels(labels), nil
	}
	return nil, types.InvalidParameterErrorf("unrecognized network configuration format: %v", data)
}
// newConfigFromLabels creates a new configuration from the given labels.
func newConfigFromLabels(labels map[string]string) *configuration {
	cfg := &configuration{}
	for k, v := range labels {
		switch k {
		case parentOpt:
			// driver option '-o parent'
			cfg.Parent = v
		case driverModeOpt:
			// driver option '-o macvlan_mode'
			cfg.MacvlanMode = v
		}
	}
	return cfg
}
// processIPAM parses v4 and v6 IP information and binds it to the network configuration
func (config *configuration) processIPAM(ipamV4Data, ipamV6Data []driverapi.IPAMData) {
	for _, d4 := range ipamV4Data {
		sub := &ipSubnet{SubnetIP: d4.Pool.String(), GwIP: d4.Gateway.String()}
		config.Ipv4Subnets = append(config.Ipv4Subnets, sub)
	}
	for _, d6 := range ipamV6Data {
		sub := &ipSubnet{SubnetIP: d6.Pool.String(), GwIP: d6.Gateway.String()}
		config.Ipv6Subnets = append(config.Ipv6Subnets, sub)
	}
}
//go:build linux
package macvlan
import (
"context"
"fmt"
"strconv"
"strings"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/vishvananda/netlink"
)
// Naming for the dummy parent links the driver auto-creates when no
// -o parent interface is supplied (see createDummyLink / getDummyName).
const (
	dummyPrefix   = "dm-" // prefix for dummy macvlan parent interfaces.
	dummyIDLength = 12    // length for dummy parent interface IDs.
)
// createMacVlan creates a macvlan slave link attached to the given parent
// interface and returns the name of the newly created link.
func createMacVlan(containerIfName, parent, macvlanMode string) (string, error) {
	// resolve the requested mode; bridge is the driver default
	mode, err := setMacVlanMode(macvlanMode)
	if err != nil {
		return "", fmt.Errorf("Unsupported %s macvlan mode: %v", macvlanMode, err)
	}
	// the parent iface acting as the macvlan master must already exist
	if !parentExists(parent) {
		return "", fmt.Errorf("the requested parent interface %s was not found on the Docker host", parent)
	}
	// look up the master link (Example: the docker host eth iface)
	parentLink, err := ns.NlHandle().LinkByName(parent)
	if err != nil {
		return "", fmt.Errorf("error occurred looking up the macvlan parent iface %s error: %s", parent, err)
	}
	// assemble and create the macvlan link
	slave := &netlink.Macvlan{
		LinkAttrs: netlink.LinkAttrs{
			Name:        containerIfName,
			ParentIndex: parentLink.Attrs().Index,
		},
		Mode: mode,
	}
	if err := ns.NlHandle().LinkAdd(slave); err != nil {
		// If a user creates a macvlan and ipvlan on same parent, only one slave iface can be active at a time.
		return "", fmt.Errorf("failed to create the macvlan port: %v", err)
	}
	return slave.Attrs().Name, nil
}
// setMacVlanMode translates one of the four supported macvlan port type
// strings into the corresponding netlink mode constant.
func setMacVlanMode(mode string) (netlink.MacvlanMode, error) {
	modes := map[string]netlink.MacvlanMode{
		modePrivate:  netlink.MACVLAN_MODE_PRIVATE,
		modeVepa:     netlink.MACVLAN_MODE_VEPA,
		modeBridge:   netlink.MACVLAN_MODE_BRIDGE,
		modePassthru: netlink.MACVLAN_MODE_PASSTHRU,
	}
	m, ok := modes[mode]
	if !ok {
		return 0, fmt.Errorf("unknown macvlan mode: %s", mode)
	}
	return m, nil
}
// parentExists checks if the specified interface exists in the default namespace
func parentExists(ifaceStr string) bool {
	if _, err := ns.NlHandle().LinkByName(ifaceStr); err != nil {
		return false
	}
	return true
}
// createVlanLink parses sub-interfaces and vlan id for creation
func createVlanLink(parentName string) error {
	// only names of the form iface.vlan_id are handled here
	if !strings.Contains(parentName, ".") {
		return fmt.Errorf("invalid subinterface vlan name %s, example formatting is eth0.10", parentName)
	}
	parent, vidInt, err := parseVlan(parentName)
	if err != nil {
		return err
	}
	// VLAN identifier or VID is a 12-bit field specifying the VLAN to which the frame belongs
	if vidInt < 1 || vidInt > 4094 {
		return fmt.Errorf("vlan id must be between 1-4094, received: %d", vidInt)
	}
	// get the parent link to attach a vlan subinterface
	parentLink, err := ns.NlHandle().LinkByName(parent)
	if err != nil {
		return fmt.Errorf("failed to find master interface %s on the Docker host: %v", parent, err)
	}
	vlanLink := &netlink.Vlan{
		LinkAttrs: netlink.LinkAttrs{
			Name:        parentName,
			ParentIndex: parentLink.Attrs().Index,
		},
		VlanId: vidInt,
	}
	// create the subinterface
	if err := ns.NlHandle().LinkAdd(vlanLink); err != nil {
		return fmt.Errorf("failed to create %s vlan link: %v", vlanLink.Name, err)
	}
	// Bring the new netlink iface up
	if err := ns.NlHandle().LinkSetUp(vlanLink); err != nil {
		return fmt.Errorf("failed to enable %s the macvlan parent link %v", vlanLink.Name, err)
	}
	log.G(context.TODO()).Debugf("Added a vlan tagged netlink subinterface: %s with a vlan id: %d", parentName, vidInt)
	return nil
}
// delVlanLink verifies only sub-interfaces with a vlan id get deleted.
// Interfaces that do not parse to iface.vlan_id are left in place since they
// could be user-specified names not created by the driver.
func delVlanLink(linkName string) error {
	if strings.Contains(linkName, ".") {
		if _, _, err := parseVlan(linkName); err != nil {
			return err
		}
		// delete the vlan subinterface
		vlanLink, err := ns.NlHandle().LinkByName(linkName)
		if err != nil {
			return fmt.Errorf("failed to find interface %s on the Docker host : %v", linkName, err)
		}
		// verify a parent interface isn't being deleted
		if vlanLink.Attrs().ParentIndex == 0 {
			// Fix: err is guaranteed nil at this point (LinkByName succeeded
			// above), so the previous message rendered a meaningless ": <nil>"
			// suffix; report the condition without formatting err.
			return fmt.Errorf("interface %s does not appear to be a slave device", linkName)
		}
		// delete the macvlan slave device
		if err := ns.NlHandle().LinkDel(vlanLink); err != nil {
			return fmt.Errorf("failed to delete %s link: %v", linkName, err)
		}
		log.G(context.TODO()).Debugf("Deleted a vlan tagged netlink subinterface: %s", linkName)
	}
	// if the subinterface doesn't parse to iface.vlan_id leave the interface in
	// place since it could be a user specified name not created by the driver.
	return nil
}
// parseVlan parses and verifies a slave interface name: -o parent=eth0.10
func parseVlan(linkName string) (string, int, error) {
	// expected format is iface.vlan_id, e.g. eth0.10
	parts := strings.Split(linkName, ".")
	if len(parts) != 2 {
		return "", 0, fmt.Errorf("required interface name format is: name.vlan_id, ex. eth0.10 for vlan 10, instead received %s", linkName)
	}
	parent := parts[0]
	// validate type and convert vlan id to int
	vidInt, err := strconv.Atoi(parts[1])
	if err != nil {
		return "", 0, fmt.Errorf("unable to parse a valid vlan id from: %s (ex. eth0.10 for vlan 10)", parts[1])
	}
	// the base interface must exist on the host
	if !parentExists(parent) {
		return "", 0, fmt.Errorf("-o parent interface was not found on the host: %s", parent)
	}
	return parent, vidInt, nil
}
// createDummyLink creates a dummy parent link to stand in when no -o parent
// interface was specified.
//
// NOTE(review): truncNetID is currently unused by this function; it is kept to
// preserve the existing call signature.
func createDummyLink(dummyName, truncNetID string) error {
	dummy := &netlink.Dummy{
		LinkAttrs: netlink.LinkAttrs{Name: dummyName},
	}
	if err := ns.NlHandle().LinkAdd(dummy); err != nil {
		return err
	}
	created, err := ns.NlHandle().LinkByName(dummyName)
	if err != nil {
		return fmt.Errorf("error occurred looking up the macvlan parent iface %s error: %s", dummyName, err)
	}
	// bring the new netlink iface up
	if err := ns.NlHandle().LinkSetUp(created); err != nil {
		return fmt.Errorf("failed to enable %s the macvlan parent link: %v", dummyName, err)
	}
	return nil
}
// delDummyLink deletes the link type dummy used when -o parent is not passed
func delDummyLink(linkName string) error {
	dummyLink, err := ns.NlHandle().LinkByName(linkName)
	if err != nil {
		return fmt.Errorf("failed to find link %s on the Docker host : %v", linkName, err)
	}
	// a true dummy parent has no parent index; refuse to delete anything else
	if dummyLink.Attrs().ParentIndex != 0 {
		return fmt.Errorf("link %s is not a parent dummy interface", linkName)
	}
	// remove the dummy device
	if err := ns.NlHandle().LinkDel(dummyLink); err != nil {
		return fmt.Errorf("failed to delete the dummy %s link: %v", linkName, err)
	}
	log.G(context.TODO()).Debugf("Deleted a dummy parent link: %s", linkName)
	return nil
}
// getDummyName returns the name of a dummy parent with truncated net ID and driver prefix
func getDummyName(netID string) string {
	trunc := netID
	if len(trunc) > dummyIDLength {
		trunc = trunc[:dummyIDLength]
	}
	return dummyPrefix + trunc
}
//go:build linux
package macvlan
import (
"context"
"errors"
"fmt"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/types"
)
// network looks up a network by id under the driver lock, logging an error
// when the id is unknown. Returns nil for an unknown id.
func (d *driver) network(nid string) *network {
	d.Lock()
	nw, found := d.networks[nid]
	d.Unlock()
	if !found {
		log.G(context.TODO()).Errorf("network id %s not found", nid)
	}
	return nw
}
// addNetwork records n in the driver's network table under the driver lock.
func (d *driver) addNetwork(n *network) {
	d.Lock()
	defer d.Unlock()
	d.networks[n.id] = n
}

// deleteNetwork removes the network with the given id from the driver's table.
func (d *driver) deleteNetwork(nid string) {
	d.Lock()
	defer d.Unlock()
	delete(d.networks, nid)
}
// getNetworks Safely returns a slice of existing networks
func (d *driver) getNetworks() []*network {
	d.Lock()
	defer d.Unlock()
	out := make([]*network, 0, len(d.networks))
	for _, nw := range d.networks {
		out = append(out, nw)
	}
	return out
}
// endpoint returns the endpoint with the given id, or nil if none exists.
func (n *network) endpoint(eid string) *endpoint {
	n.Lock()
	ep := n.endpoints[eid]
	n.Unlock()
	return ep
}
// addEndpoint records ep in the network's endpoint table under the lock.
func (n *network) addEndpoint(ep *endpoint) {
	n.Lock()
	defer n.Unlock()
	n.endpoints[ep.id] = ep
}

// deleteEndpoint removes the endpoint with the given id from the table.
func (n *network) deleteEndpoint(eid string) {
	n.Lock()
	defer n.Unlock()
	delete(n.endpoints, eid)
}
// getEndpoint returns the endpoint for eid. An empty eid yields an error;
// an unknown eid yields (nil, nil), matching the map's zero value.
func (n *network) getEndpoint(eid string) (*endpoint, error) {
	n.Lock()
	defer n.Unlock()
	if eid == "" {
		return nil, fmt.Errorf("endpoint id %s not found", eid)
	}
	return n.endpoints[eid], nil
}
// validateID checks that both the network and endpoint ids are non-empty.
func validateID(nid, eid string) error {
	switch {
	case nid == "":
		return errors.New("invalid network id")
	case eid == "":
		return errors.New("invalid endpoint id")
	}
	return nil
}
// getNetwork returns the network for id, or an error when id is empty or
// unknown to the driver.
func (d *driver) getNetwork(id string) (*network, error) {
	d.Lock()
	defer d.Unlock()
	if id == "" {
		return nil, types.InvalidParameterErrorf("invalid network id: %s", id)
	}
	nw, ok := d.networks[id]
	if !ok {
		return nil, types.NotFoundErrorf("network not found: %s", id)
	}
	return nw, nil
}
//go:build linux
package macvlan
import (
"context"
"encoding/json"
"errors"
"fmt"
"net"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/datastore"
"github.com/docker/docker/daemon/libnetwork/types"
)
// Datastore key prefixes under which macvlan network and endpoint records
// are persisted.
const (
	macvlanPrefix         = "macvlan"                   // base prefix for all macvlan keys
	macvlanNetworkPrefix  = macvlanPrefix + "/network"  // prefix for network config records
	macvlanEndpointPrefix = macvlanPrefix + "/endpoint" // prefix for endpoint records
)
// networkConfiguration for this driver's network specific configuration.
// This is the record that gets marshalled to and from the persistent store
// (see MarshalJSON/UnmarshalJSON below).
type configuration struct {
	ID               string      // network id
	Mtu              int         // configured MTU (not interpreted in this file)
	dbIndex          uint64      // datastore revision index, set via SetIndex
	dbExists         bool        // whether the record currently exists in the datastore
	Internal         bool        // network has no parent iface / was created with the internal label
	Parent           string      // host interface acting as the macvlan parent
	MacvlanMode      string      // macvlan mode: bridge, private, passthru or vepa
	CreatedSlaveLink bool        // true when the driver created the parent link itself
	Ipv4Subnets      []*ipSubnet // IPv4 subnets bound to the network
	Ipv6Subnets      []*ipSubnet // IPv6 subnets bound to the network
}

// ipSubnet pairs a subnet with its gateway, both kept in string form.
type ipSubnet struct {
	SubnetIP string // subnet, as produced by IPAMData.Pool.String()
	GwIP     string // gateway, as produced by IPAMData.Gateway.String()
}
// initStore drivers are responsible for caching their own persistent state
func (d *driver) initStore() error {
	if err := d.populateNetworks(); err != nil {
		return err
	}
	return d.populateEndpoints()
}
// populateNetworks is invoked at driver init to recreate persistently stored networks
func (d *driver) populateNetworks() error {
	kvol, err := d.store.List(&configuration{})
	if err != nil {
		if errors.Is(err, datastore.ErrKeyNotFound) {
			// no macvlan networks have been created yet
			return nil
		}
		return fmt.Errorf("failed to get macvlan network configurations from store: %w", err)
	}
	for _, kvo := range kvol {
		config := kvo.(*configuration)
		if _, err := d.createNetwork(config); err != nil {
			log.G(context.TODO()).Warnf("Could not create macvlan network for id %s from persistent state", config.ID)
		}
	}
	return nil
}
// populateEndpoints restores persisted endpoints into their networks at init,
// pruning endpoint records whose network no longer exists.
func (d *driver) populateEndpoints() error {
	kvol, err := d.store.List(&endpoint{})
	if err != nil {
		if errors.Is(err, datastore.ErrKeyNotFound) {
			return nil
		}
		return fmt.Errorf("failed to get macvlan endpoints from store: %w", err)
	}
	for _, kvo := range kvol {
		ep := kvo.(*endpoint)
		n, ok := d.networks[ep.nid]
		if !ok {
			// owning network is gone — drop the stale endpoint record
			log.G(context.TODO()).Debugf("Network (%.7s) not found for restored macvlan endpoint (%.7s)", ep.nid, ep.id)
			log.G(context.TODO()).Debugf("Deleting stale macvlan endpoint (%.7s) from store", ep.id)
			if err := d.storeDelete(ep); err != nil {
				log.G(context.TODO()).Debugf("Failed to delete stale macvlan endpoint (%.7s) from store", ep.id)
			}
			continue
		}
		n.endpoints[ep.id] = ep
		log.G(context.TODO()).Debugf("Endpoint (%.7s) restored to network (%.7s)", ep.id, ep.nid)
	}
	return nil
}
// storeUpdate used to update persistent macvlan network records as they are created
func (d *driver) storeUpdate(kvObject datastore.KVObject) error {
	if d.store == nil {
		// best-effort when no store was configured
		log.G(context.TODO()).Warnf("macvlan store not initialized. kv object %s is not added to the store", datastore.Key(kvObject.Key()...))
		return nil
	}
	err := d.store.PutObjectAtomic(kvObject)
	if err != nil {
		return fmt.Errorf("failed to update macvlan store for object type %T: %v", kvObject, err)
	}
	return nil
}
// storeDelete used to delete macvlan records from persistent cache as they are deleted
func (d *driver) storeDelete(kvObject datastore.KVObject) error {
	if d.store != nil {
		return d.store.DeleteObject(kvObject)
	}
	// no store configured; nothing to delete
	log.G(context.TODO()).Debugf("macvlan store not initialized. kv object %s is not deleted from store", datastore.Key(kvObject.Key()...))
	return nil
}
// MarshalJSON encodes the configuration into the generic map form used by the
// persistent store. Subnet slices are stored as embedded JSON strings.
func (config *configuration) MarshalJSON() ([]byte, error) {
	nMap := map[string]interface{}{
		"ID":              config.ID,
		"Mtu":             config.Mtu,
		"Parent":          config.Parent,
		"MacvlanMode":     config.MacvlanMode,
		"Internal":        config.Internal,
		"CreatedSubIface": config.CreatedSlaveLink,
	}
	if len(config.Ipv4Subnets) > 0 {
		b, err := json.Marshal(config.Ipv4Subnets)
		if err != nil {
			return nil, err
		}
		nMap["Ipv4Subnets"] = string(b)
	}
	if len(config.Ipv6Subnets) > 0 {
		b, err := json.Marshal(config.Ipv6Subnets)
		if err != nil {
			return nil, err
		}
		nMap["Ipv6Subnets"] = string(b)
	}
	return json.Marshal(nMap)
}
// UnmarshalJSON decodes a persisted configuration record.
//
// Fix: the previous implementation used unchecked type assertions
// (nMap["ID"].(string) etc.), which panic when a store record is corrupt,
// truncated, or was written by an incompatible version. Each field is now
// checked and a descriptive error is returned instead.
func (config *configuration) UnmarshalJSON(b []byte) error {
	var nMap map[string]interface{}
	if err := json.Unmarshal(b, &nMap); err != nil {
		return err
	}
	var ok bool
	if config.ID, ok = nMap["ID"].(string); !ok {
		return fmt.Errorf("macvlan network config: invalid or missing ID field")
	}
	// JSON numbers decode into float64 when targeting interface{}
	mtu, ok := nMap["Mtu"].(float64)
	if !ok {
		return fmt.Errorf("macvlan network config: invalid or missing Mtu field")
	}
	config.Mtu = int(mtu)
	if config.Parent, ok = nMap["Parent"].(string); !ok {
		return fmt.Errorf("macvlan network config: invalid or missing Parent field")
	}
	if config.MacvlanMode, ok = nMap["MacvlanMode"].(string); !ok {
		return fmt.Errorf("macvlan network config: invalid or missing MacvlanMode field")
	}
	if config.Internal, ok = nMap["Internal"].(bool); !ok {
		return fmt.Errorf("macvlan network config: invalid or missing Internal field")
	}
	if config.CreatedSlaveLink, ok = nMap["CreatedSubIface"].(bool); !ok {
		return fmt.Errorf("macvlan network config: invalid or missing CreatedSubIface field")
	}
	// subnet slices are stored as embedded JSON strings
	if v, ok := nMap["Ipv4Subnets"]; ok {
		s, ok := v.(string)
		if !ok {
			return fmt.Errorf("macvlan network config: invalid Ipv4Subnets field")
		}
		if err := json.Unmarshal([]byte(s), &config.Ipv4Subnets); err != nil {
			return err
		}
	}
	if v, ok := nMap["Ipv6Subnets"]; ok {
		s, ok := v.(string)
		if !ok {
			return fmt.Errorf("macvlan network config: invalid Ipv6Subnets field")
		}
		if err := json.Unmarshal([]byte(s), &config.Ipv6Subnets); err != nil {
			return err
		}
	}
	return nil
}
// Key returns the datastore key path for this network record.
func (config *configuration) Key() []string {
	return []string{macvlanNetworkPrefix, config.ID}
}

// KeyPrefix returns the common key prefix for all macvlan network records.
func (config *configuration) KeyPrefix() []string {
	return []string{macvlanNetworkPrefix}
}

// Value serializes the configuration to JSON; it returns nil when marshalling
// fails, per the datastore.KVObject contract.
func (config *configuration) Value() []byte {
	b, err := json.Marshal(config)
	if err != nil {
		return nil
	}
	return b
}

// SetValue deserializes a stored JSON value into the configuration.
func (config *configuration) SetValue(value []byte) error {
	return json.Unmarshal(value, config)
}

// Index returns the datastore revision index of this record.
func (config *configuration) Index() uint64 {
	return config.dbIndex
}

// SetIndex records the datastore revision and marks the record as persisted.
func (config *configuration) SetIndex(index uint64) {
	config.dbIndex = index
	config.dbExists = true
}

// Exists reports whether this record is currently present in the datastore.
func (config *configuration) Exists() bool {
	return config.dbExists
}

// Skip reports whether the datastore should skip persisting this record;
// macvlan network records are always persisted.
func (config *configuration) Skip() bool {
	return false
}

// New returns an empty configuration; used by the datastore when listing and
// restoring records.
func (config *configuration) New() datastore.KVObject {
	return &configuration{}
}

// CopyTo copies this configuration into the supplied KVObject, which must be
// a *configuration.
func (config *configuration) CopyTo(o datastore.KVObject) error {
	dstNcfg := o.(*configuration)
	*dstNcfg = *config
	return nil
}
// MarshalJSON encodes the endpoint into the generic map form used by the
// persistent store. Optional address fields are written only when set.
func (ep *endpoint) MarshalJSON() ([]byte, error) {
	epMap := map[string]interface{}{
		"id":      ep.id,
		"nid":     ep.nid,
		"SrcName": ep.srcName,
	}
	if len(ep.mac) != 0 {
		epMap["MacAddress"] = ep.mac.String()
	}
	if ep.addr != nil {
		epMap["Addr"] = ep.addr.String()
	}
	if ep.addrv6 != nil {
		epMap["Addrv6"] = ep.addrv6.String()
	}
	return json.Marshal(epMap)
}
// UnmarshalJSON decodes a persisted endpoint record.
//
// Fix: the previous implementation used unchecked type assertions on the
// decoded map values (v.(string), epMap["id"].(string), ...), which panic on
// a corrupt or incompatible store record. All assertions are now checked and
// return a descriptive error instead.
func (ep *endpoint) UnmarshalJSON(b []byte) error {
	var epMap map[string]interface{}
	if err := json.Unmarshal(b, &epMap); err != nil {
		return fmt.Errorf("Failed to unmarshal to macvlan endpoint: %v", err)
	}
	var err error
	if v, ok := epMap["MacAddress"]; ok {
		s, ok := v.(string)
		if !ok {
			return types.InternalErrorf("macvlan endpoint MacAddress (%v) is not a string after json unmarshal", v)
		}
		if ep.mac, err = net.ParseMAC(s); err != nil {
			return types.InternalErrorf("failed to decode macvlan endpoint MAC address (%s) after json unmarshal: %v", s, err)
		}
	}
	if v, ok := epMap["Addr"]; ok {
		s, ok := v.(string)
		if !ok {
			return types.InternalErrorf("macvlan endpoint Addr (%v) is not a string after json unmarshal", v)
		}
		if ep.addr, err = types.ParseCIDR(s); err != nil {
			return types.InternalErrorf("failed to decode macvlan endpoint IPv4 address (%s) after json unmarshal: %v", s, err)
		}
	}
	if v, ok := epMap["Addrv6"]; ok {
		s, ok := v.(string)
		if !ok {
			return types.InternalErrorf("macvlan endpoint Addrv6 (%v) is not a string after json unmarshal", v)
		}
		if ep.addrv6, err = types.ParseCIDR(s); err != nil {
			return types.InternalErrorf("failed to decode macvlan endpoint IPv6 address (%s) after json unmarshal: %v", s, err)
		}
	}
	var ok bool
	if ep.id, ok = epMap["id"].(string); !ok {
		return types.InternalErrorf("macvlan endpoint has an invalid or missing id field")
	}
	if ep.nid, ok = epMap["nid"].(string); !ok {
		return types.InternalErrorf("macvlan endpoint has an invalid or missing nid field")
	}
	if ep.srcName, ok = epMap["SrcName"].(string); !ok {
		return types.InternalErrorf("macvlan endpoint has an invalid or missing SrcName field")
	}
	return nil
}
// Key returns the datastore key path for this endpoint record.
func (ep *endpoint) Key() []string {
	return []string{macvlanEndpointPrefix, ep.id}
}

// KeyPrefix returns the common key prefix for all macvlan endpoint records.
func (ep *endpoint) KeyPrefix() []string {
	return []string{macvlanEndpointPrefix}
}

// Value serializes the endpoint to JSON; it returns nil when marshalling
// fails, per the datastore.KVObject contract.
func (ep *endpoint) Value() []byte {
	b, err := json.Marshal(ep)
	if err != nil {
		return nil
	}
	return b
}

// SetValue deserializes a stored JSON value into the endpoint.
func (ep *endpoint) SetValue(value []byte) error {
	return json.Unmarshal(value, ep)
}

// Index returns the datastore revision index of this record.
func (ep *endpoint) Index() uint64 {
	return ep.dbIndex
}

// SetIndex records the datastore revision and marks the record as persisted.
func (ep *endpoint) SetIndex(index uint64) {
	ep.dbIndex = index
	ep.dbExists = true
}

// Exists reports whether this record is currently present in the datastore.
func (ep *endpoint) Exists() bool {
	return ep.dbExists
}

// Skip reports whether the datastore should skip persisting this record;
// macvlan endpoint records are always persisted.
func (ep *endpoint) Skip() bool {
	return false
}

// New returns an empty endpoint; used by the datastore when listing and
// restoring records.
func (ep *endpoint) New() datastore.KVObject {
	return &endpoint{}
}

// CopyTo copies this endpoint into the supplied KVObject, which must be an
// *endpoint.
func (ep *endpoint) CopyTo(o datastore.KVObject) error {
	dstEp := o.(*endpoint)
	*dstEp = *ep
	return nil
}
package null
import (
"context"
"sync"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/scope"
"github.com/docker/docker/daemon/libnetwork/types"
)
// NetworkType is the driver name under which the null driver registers.
const NetworkType = "null"

// driver implements the null network driver. At most one null network may
// exist at a time; its id is held in network (empty when none exists).
type driver struct {
	network string
	sync.Mutex
}
// Register registers a new instance of the null driver.
func Register(r driverapi.Registerer) error {
	capability := driverapi.Capability{DataScope: scope.Local}
	return r.RegisterDriver(NetworkType, &driver{}, capability)
}
// NetworkAllocate is not supported by the null driver.
func (d *driver) NetworkAllocate(id string, option map[string]string, ipV4Data, ipV6Data []driverapi.IPAMData) (map[string]string, error) {
	return nil, types.NotImplementedErrorf("not implemented")
}

// NetworkFree is not supported by the null driver.
func (d *driver) NetworkFree(id string) error {
	return types.NotImplementedErrorf("not implemented")
}

// EventNotify is a no-op: the null driver produces no table events.
func (d *driver) EventNotify(etype driverapi.EventType, nid, tableName, key string, value []byte) {
}

// DecodeTableEntry is a no-op: the null driver has no table entries.
func (d *driver) DecodeTableEntry(tablename string, key string, value []byte) (string, map[string]string) {
	return "", nil
}

// CreateNetwork records the single allowed null network; a second create
// attempt is rejected while the first network exists.
func (d *driver) CreateNetwork(ctx context.Context, id string, option map[string]interface{}, nInfo driverapi.NetworkInfo, ipV4Data, ipV6Data []driverapi.IPAMData) error {
	d.Lock()
	defer d.Unlock()
	if d.network != "" {
		return types.ForbiddenErrorf("only one instance of %q network is allowed", NetworkType)
	}
	d.network = id
	return nil
}

// DeleteNetwork always fails: the null network cannot be removed.
func (d *driver) DeleteNetwork(nid string) error {
	return types.ForbiddenErrorf("network of type %q cannot be deleted", NetworkType)
}

// CreateEndpoint is a no-op for the null driver.
func (d *driver) CreateEndpoint(_ context.Context, nid, eid string, ifInfo driverapi.InterfaceInfo, epOptions map[string]interface{}) error {
	return nil
}

// DeleteEndpoint is a no-op for the null driver.
func (d *driver) DeleteEndpoint(nid, eid string) error {
	return nil
}

// EndpointOperInfo returns an empty operational-info map.
func (d *driver) EndpointOperInfo(nid, eid string) (map[string]interface{}, error) {
	return make(map[string]interface{}), nil
}

// Join method is invoked when a Sandbox is attached to an endpoint.
// It is a no-op for the null driver.
func (d *driver) Join(_ context.Context, nid, eid string, sboxKey string, jinfo driverapi.JoinInfo, _, _ map[string]interface{}) error {
	return nil
}

// Leave method is invoked when a Sandbox detaches from an endpoint.
// It is a no-op for the null driver.
func (d *driver) Leave(nid, eid string) error {
	return nil
}

// Type returns the driver's network type string.
func (d *driver) Type() string {
	return NetworkType
}

// IsBuiltIn reports that the null driver ships with the daemon.
func (d *driver) IsBuiltIn() bool {
	return true
}
package overlay
import (
"fmt"
"strconv"
"strings"
"golang.org/x/net/bpf"
)
// vniMatchBPF returns a BPF program suitable for passing to the iptables and
// ip6tables bpf match which matches on the VXLAN Network ID of encapsulated
// packets. The program assumes that it will be used in a rule which only
// matches UDP datagrams.
func vniMatchBPF(vni uint32) []bpf.RawInstruction {
	asm, err := bpf.Assemble([]bpf.Instruction{
		// Load offset of UDP payload into X.
		bpf.LoadExtension{Num: bpf.ExtPayloadOffset}, // ld poff
		bpf.TAX{}, // tax
		bpf.LoadIndirect{Off: 4, Size: 4}, // ld [x + 4] ; Load VXLAN ID into top 24 bits of A
		bpf.ALUOpConstant{Op: bpf.ALUOpShiftRight, Val: 8}, // rsh #8 ; A >>= 8
		bpf.JumpIf{Cond: bpf.JumpEqual, Val: vni, SkipTrue: 1}, // jeq $vni, match
		bpf.RetConstant{Val: 0}, // ret #0
		bpf.RetConstant{Val: ^uint32(0)}, // match: ret #-1
	})
	// bpf.Assemble() only errors if an instruction is invalid. As the only variable
	// part of the program is an instruction value for which the entire range is
	// valid, whether the program can be successfully assembled is independent of
	// the input. Given that the only recourse is to fix this function and
	// recompile, there's little value in bubbling the error up to the caller.
	if err != nil {
		panic(err)
	}
	return asm
}
// marshalXTBPF marshals a BPF program into the "decimal" byte code format
// which is suitable for passing to the [iptables bpf match].
//
//	iptables -m bpf --bytecode
//
// The format is the instruction count followed by one "op jt jf k" quad per
// instruction, comma separated.
//
// [iptables bpf match]: https://ipset.netfilter.org/iptables-extensions.man.html#lbAH
func marshalXTBPF(prog []bpf.RawInstruction) string { //nolint:unused
	parts := make([]string, 0, len(prog)+1)
	parts = append(parts, strconv.Itoa(len(prog)))
	for _, ins := range prog {
		parts = append(parts, fmt.Sprintf("%d %d %d %d", ins.Op, ins.Jt, ins.Jf, ins.K))
	}
	return strings.Join(parts, ",")
}
// matchVXLAN returns an iptables rule fragment which matches VXLAN datagrams
// with the given destination port and VXLAN Network ID utilizing the xt_bpf
// netfilter kernel module. The returned slice's backing array is guaranteed not
// to alias any other slice's.
func matchVXLAN(port, vni uint32) []string {
	// https://ipset.netfilter.org/iptables-extensions.man.html#lbAH
	return []string{
		"-p", "udp",
		"--dport", strconv.FormatUint(uint64(port), 10),
		"-m", "bpf", "--bytecode", marshalXTBPF(vniMatchBPF(vni)),
	}
}
//go:build linux
package overlay
import (
"bytes"
"context"
"encoding/binary"
"encoding/hex"
"errors"
"fmt"
"hash/fnv"
"net"
"net/netip"
"strconv"
"syscall"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/drivers/overlay/overlayutils"
"github.com/docker/docker/daemon/libnetwork/iptables"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/vishvananda/netlink"
)
/*
Encrypted overlay networks use IPsec in transport mode to encrypt and
authenticate the VXLAN UDP datagrams. This driver implements a bespoke control
plane which negotiates the security parameters for each peer-to-peer tunnel.
IPsec Terminology
- ESP: IPSec Encapsulating Security Payload
- SPI: Security Parameter Index
- ICV: Integrity Check Value
- SA: Security Association https://en.wikipedia.org/wiki/IPsec#Security_association
Developer documentation for Linux IPsec is rather sparse online. The following
slide deck provides a decent overview.
https://libreswan.org/wiki/images/e/e0/Netdev-0x12-ipsec-flow.pdf
The Linux IPsec stack is part of XFRM, the netlink packet transformation
interface.
https://man7.org/linux/man-pages/man8/ip-xfrm.8.html
*/
const (
	// Value used to mark outgoing packets which should have our IPsec
	// processing applied. It is also used as a label to identify XFRM
	// states (Security Associations) and policies (Security Policies)
	// programmed by us so we know which ones we can clean up without
	// disrupting other VPN connections on the system.
	mark = 0xD0C4E3

	pktExpansion = 26 // SPI(4) + SeqN(4) + IV(8) + PadLength(1) + NextHeader(1) + ICV(8)
)

// Direction selectors used by programSA: program only the forward SA, only
// the reverse SA, or both (bidir == forward|reverse as bit flags).
const (
	forward = iota + 1
	reverse
	bidir
)

// Mark value for matching packets which should have our IPsec security policy
// applied.
var spMark = netlink.XfrmMark{Value: mark, Mask: 0xffffffff}
// key is a piece of encryption key material together with the tag that
// identifies it during key rotation.
type key struct {
	value []byte
	tag   uint32
}

// String returns a loggable representation of k that exposes at most the
// first 5 hex digits of the key material, or "" when k is nil.
//
// Fix: the previous unconditional slice [0:5] panicked (index out of range)
// for keys shorter than 3 bytes; the prefix is now bounded by the encoded
// length.
func (k *key) String() string {
	if k == nil {
		return ""
	}
	enc := hex.EncodeToString(k.value)
	if len(enc) > 5 {
		enc = enc[:5]
	}
	return fmt.Sprintf("(key: %s, tag: 0x%x)", enc, k.tag)
}
// Security Parameter Indices for the IPsec flows between local node and a
// remote peer, which identify the Security Associations (XFRM states) to be
// applied when encrypting and decrypting packets.
type spi struct {
	forward int
	reverse int
}

// String renders both SPIs as unsigned hex values for logging.
func (s *spi) String() string {
	fwd, rev := uint32(s.forward), uint32(s.reverse)
	return fmt.Sprintf("SPI(FWD: 0x%x, REV: 0x%x)", fwd, rev)
}
// encrNode holds the encryption state for one remote peer: the SPI pairs
// derived for each configured key, and a reference count incremented by
// setupEncryption / decremented by removeEncryption.
type encrNode struct {
	spi   []spi
	count int
}

// encrMap is a map of node IP addresses to their encryption parameters.
//
// Like all Go maps, it is not safe for concurrent use.
type encrMap map[netip.Addr]encrNode
// String renders the map for debug logging. Note that map iteration order is
// not deterministic, so successive calls may list peers in different order.
func (e encrMap) String() string {
	var buf bytes.Buffer
	for addr, node := range e {
		buf.WriteString("\n" + addr.String() + ":[")
		for _, s := range node.spi {
			buf.WriteString(s.String() + ",")
		}
		buf.WriteString("]")
	}
	return buf.String()
}
// setupEncryption programs the encryption parameters for secure communication
// between the local node and a remote node.
//
// For every configured key a forward/reverse SPI pair is derived and the
// corresponding Security Associations are programmed; the Security Policies
// are only tied to the primary (first) key's SAs. The reference count for
// remoteIP in d.secMap is incremented on every call, paired with
// removeEncryption.
func (d *driver) setupEncryption(remoteIP netip.Addr) error {
	log.G(context.TODO()).Debugf("setupEncryption(%s)", remoteIP)
	d.encrMu.Lock()
	defer d.encrMu.Unlock()
	if len(d.keys) == 0 {
		return types.ForbiddenErrorf("encryption key is not present")
	}
	// snapshot the local addresses under the driver state lock
	d.mu.Lock()
	localIP, advIP := d.bindAddress, d.advertiseAddress
	d.mu.Unlock()
	log.G(context.TODO()).Debugf("Programming encryption between %s and %s", localIP, remoteIP)
	indices := make([]spi, 0, len(d.keys))
	for i, k := range d.keys {
		// SPIs are derived deterministically from the advertise address, the
		// peer address and the key tag, so both ends compute the same values.
		spis := spi{buildSPI(advIP.AsSlice(), remoteIP.AsSlice(), k.tag), buildSPI(remoteIP.AsSlice(), advIP.AsSlice(), k.tag)}
		// only the primary (first) key is programmed bidirectionally
		dir := reverse
		if i == 0 {
			dir = bidir
		}
		fSA, rSA, err := programSA(localIP.AsSlice(), remoteIP.AsSlice(), spis, k, dir, true)
		if err != nil {
			// best-effort: log and continue so the remaining keys are still programmed
			log.G(context.TODO()).Warn(err)
		}
		indices = append(indices, spis)
		if i != 0 {
			continue
		}
		// the security policy references only the primary key's SAs
		err = programSP(fSA, rSA, true)
		if err != nil {
			log.G(context.TODO()).Warn(err)
		}
	}
	// record the programmed SPIs and bump the peer's reference count
	node := d.secMap[remoteIP]
	node.spi = indices
	node.count++
	d.secMap[remoteIP] = node
	return nil
}
// removeEncryption drops one reference to the encryption state for remoteIP.
// Only when the last reference is released are the programmed Security
// Associations (and, for the primary key, the Security Policies) removed.
//
// NOTE(review): if remoteIP is absent from d.secMap, node.count starts at zero
// and is stored back as -1 — presumably callers only invoke this after a
// matching setupEncryption; confirm against the call sites.
func (d *driver) removeEncryption(remoteIP netip.Addr) error {
	log.G(context.TODO()).Debugf("removeEncryption(%s)", remoteIP)
	d.encrMu.Lock()
	defer d.encrMu.Unlock()
	// spi stays nil (nothing is torn down) unless this was the last reference
	var spi []spi
	node := d.secMap[remoteIP]
	if node.count == 1 {
		delete(d.secMap, remoteIP)
		spi = node.spi
	} else {
		node.count--
		d.secMap[remoteIP] = node
	}
	for i, idxs := range spi {
		// mirror of setupEncryption: primary key was bidirectional
		dir := reverse
		if i == 0 {
			dir = bidir
		}
		fSA, rSA, err := programSA(d.bindAddress.AsSlice(), remoteIP.AsSlice(), idxs, nil, dir, false)
		if err != nil {
			// best-effort teardown: log and keep going
			log.G(context.TODO()).Warn(err)
		}
		if i != 0 {
			continue
		}
		err = programSP(fSA, rSA, false)
		if err != nil {
			log.G(context.TODO()).Warn(err)
		}
	}
	return nil
}
// transportIPTable returns the iptables handle (IPv4 or IPv6) matching the
// address family used for overlay transport.
func (d *driver) transportIPTable() (*iptables.IPTable, error) {
	v6, err := d.isIPv6Transport()
	if err != nil {
		return nil, err
	}
	if v6 {
		return iptables.GetIptable(iptables.IPv6), nil
	}
	return iptables.GetIptable(iptables.IPv4), nil
}
// programMangle installs (add=true) or removes (add=false) the mangle-table
// OUTPUT rule that marks outgoing VXLAN datagrams for the given VNI so they
// receive our IPsec processing.
func (d *driver) programMangle(vni uint32, add bool) error {
	iptable, err := d.transportIPTable()
	if err != nil {
		// Fail closed if unsure. Better safe than cleartext.
		return err
	}
	markValue := strconv.FormatUint(mark, 10)
	rule := append(matchVXLAN(overlayutils.VXLANUDPPort(), vni), "-j", "MARK", "--set-mark", markValue)
	op, action := iptables.Append, "install"
	if !add {
		op, action = iptables.Delete, "remove"
	}
	if err := iptable.ProgramRule(iptables.Mangle, "OUTPUT", op, rule); err != nil {
		return fmt.Errorf("could not %s mangle rule: %w", action, err)
	}
	return nil
}
// programInput installs (add=true) or removes (add=false) the filter-table
// INPUT rule that drops cleartext VXLAN datagrams for the given VNI.
func (d *driver) programInput(vni uint32, add bool) error {
	iptable, err := d.transportIPTable()
	if err != nil {
		// Fail closed if unsure. Better safe than cleartext.
		return err
	}
	plainVxlan := matchVXLAN(overlayutils.VXLANUDPPort(), vni)
	rule := func(policy, jump string) []string {
		args := append([]string{"-m", "policy", "--dir", "in", "--pol", policy}, plainVxlan...)
		return append(args, "-j", jump)
	}
	msg := "add"
	op := iptables.Insert
	if !add {
		msg = "remove"
		op = iptables.Delete
	}
	// Drop incoming VXLAN datagrams for the VNI which were received in cleartext.
	// Insert at the top of the chain so the packets are dropped even if an
	// administrator-configured rule exists which would otherwise unconditionally
	// accept incoming VXLAN traffic.
	if err := iptable.ProgramRule(iptables.Filter, "INPUT", op, rule("none", "DROP")); err != nil {
		return fmt.Errorf("could not %s input drop rule: %w", msg, err)
	}
	return nil
}
// programSA adds (add=true) or removes (add=false) the XFRM states (Security
// Associations) for the flows between localIP and remoteIP, in the directions
// selected by dir (forward, reverse or bidir bit flags). k supplies the AEAD
// key material and is only consulted when adding. The forward and reverse SA
// descriptors are returned (nil for unselected directions) together with the
// last existence-check error; programming failures themselves are only logged
// so both directions are always attempted.
func programSA(localIP, remoteIP net.IP, spi spi, k *key, dir int, add bool) (fSA *netlink.XfrmState, rSA *netlink.XfrmState, lastErr error) {
	var (
		action      = "Removing"
		xfrmProgram = ns.NlHandle().XfrmStateDel
	)
	if add {
		action = "Adding"
		xfrmProgram = ns.NlHandle().XfrmStateAdd
	}
	if dir&reverse > 0 {
		// reverse SA: traffic from the remote peer to this node
		rSA = &netlink.XfrmState{
			Src:   remoteIP,
			Dst:   localIP,
			Proto: netlink.XFRM_PROTO_ESP,
			Spi:   spi.reverse,
			Mode:  netlink.XFRM_MODE_TRANSPORT,
			Reqid: mark,
		}
		if add {
			rSA.Aead = buildAeadAlgo(k, spi.reverse)
		}
		// if the existence check fails, assume the state still needs the
		// requested operation so it is attempted anyway
		exists, err := saExists(rSA)
		if err != nil {
			lastErr = err
			exists = !add
		}
		if add != exists {
			log.G(context.TODO()).Debugf("%s: rSA{%s}", action, rSA)
			if err := xfrmProgram(rSA); err != nil {
				log.G(context.TODO()).Warnf("Failed %s rSA{%s}: %v", action, rSA, err)
			}
		}
	}
	if dir&forward > 0 {
		// forward SA: traffic from this node to the remote peer
		fSA = &netlink.XfrmState{
			Src:   localIP,
			Dst:   remoteIP,
			Proto: netlink.XFRM_PROTO_ESP,
			Spi:   spi.forward,
			Mode:  netlink.XFRM_MODE_TRANSPORT,
			Reqid: mark,
		}
		if add {
			fSA.Aead = buildAeadAlgo(k, spi.forward)
		}
		exists, err := saExists(fSA)
		if err != nil {
			lastErr = err
			exists = !add
		}
		if add != exists {
			log.G(context.TODO()).Debugf("%s fSA{%s}", action, fSA)
			if err := xfrmProgram(fSA); err != nil {
				log.G(context.TODO()).Warnf("Failed %s fSA{%s}: %v.", action, fSA, err)
			}
		}
	}
	return fSA, rSA, lastErr
}
// getMinimalIP returns the address in its shortest form
// If ip contains an IPv4-mapped IPv6 address, the 4-octet form of the IPv4 address will be returned.
// Otherwise ip is returned unchanged.
func getMinimalIP(ip net.IP) net.IP {
if ip != nil && ip.To4() != nil {
return ip.To4()
}
return ip
}
// programSP adds (add=true) or removes (add=false) the outbound Security
// Policy that forces marked UDP datagrams for the VXLAN port between fSA.Src
// and fSA.Dst through the ESP transform identified by fSA.Spi.
//
// NOTE(review): rSA is accepted for signature symmetry with programSA but is
// not used by this function.
func programSP(fSA *netlink.XfrmState, rSA *netlink.XfrmState, add bool) error {
	action := "Removing"
	xfrmProgram := ns.NlHandle().XfrmPolicyDel
	if add {
		action = "Adding"
		xfrmProgram = ns.NlHandle().XfrmPolicyAdd
	}
	// Create a congruent cidr: a full-length mask sized to the (minimal)
	// address family so the policy matches exactly one src/dst pair.
	s := getMinimalIP(fSA.Src)
	d := getMinimalIP(fSA.Dst)
	fullMask := net.CIDRMask(8*len(s), 8*len(s))
	fPol := &netlink.XfrmPolicy{
		Src:     &net.IPNet{IP: s, Mask: fullMask},
		Dst:     &net.IPNet{IP: d, Mask: fullMask},
		Dir:     netlink.XFRM_DIR_OUT,
		Proto:   syscall.IPPROTO_UDP,
		DstPort: int(overlayutils.VXLANUDPPort()),
		Mark:    &spMark,
		Tmpls: []netlink.XfrmPolicyTmpl{
			{
				Src:   fSA.Src,
				Dst:   fSA.Dst,
				Proto: netlink.XFRM_PROTO_ESP,
				Mode:  netlink.XFRM_MODE_TRANSPORT,
				Spi:   fSA.Spi,
				Reqid: mark,
			},
		},
	}
	// if the existence check fails, assume the policy still needs the
	// requested operation so it is attempted anyway
	exists, err := spExists(fPol)
	if err != nil {
		exists = !add
	}
	if add != exists {
		log.G(context.TODO()).Debugf("%s fSP{%s}", action, fPol)
		if err := xfrmProgram(fPol); err != nil {
			log.G(context.TODO()).Warnf("%s fSP{%s}: %v", action, fPol, err)
		}
	}
	return nil
}
// saExists reports whether the given security association is already
// programmed into the kernel. A lookup failing with ESRCH means "not
// present"; any other failure is logged and returned.
func saExists(sa *netlink.XfrmState) (bool, error) {
	_, err := ns.NlHandle().XfrmStateGet(sa)
	switch {
	case err == nil:
		return true, nil
	case errors.Is(err, syscall.ESRCH):
		return false, nil
	default:
		// Lowercase message per Go convention; wrap with %w so callers
		// can still inspect the underlying netlink error.
		err = fmt.Errorf("error while checking for SA existence: %w", err)
		log.G(context.TODO()).Warn(err)
		return false, err
	}
}
// spExists reports whether the given security policy is already
// programmed into the kernel. A lookup failing with ENOENT means "not
// present"; any other failure is logged and returned.
func spExists(sp *netlink.XfrmPolicy) (bool, error) {
	_, err := ns.NlHandle().XfrmPolicyGet(sp)
	switch {
	case err == nil:
		return true, nil
	case errors.Is(err, syscall.ENOENT):
		return false, nil
	default:
		// Lowercase message per Go convention; wrap with %w so callers
		// can still inspect the underlying netlink error.
		err = fmt.Errorf("error while checking for SP existence: %w", err)
		log.G(context.TODO()).Warn(err)
		return false, err
	}
}
func buildSPI(src, dst net.IP, st uint32) int {
b := make([]byte, 4)
binary.BigEndian.PutUint32(b, st)
h := fnv.New32a()
h.Write(src)
h.Write(b)
h.Write(dst)
return int(binary.BigEndian.Uint32(h.Sum(nil)))
}
// buildAeadAlgo constructs the AEAD (AES-GCM, RFC 4106) algorithm
// descriptor for an XFRM state. Per RFC 4106 the 4-byte salt derived
// from the SPI s is carried as the tail of the key blob.
func buildAeadAlgo(k *key, s int) *netlink.XfrmStateAlgo {
	salt := make([]byte, 4)
	binary.BigEndian.PutUint32(salt, uint32(s))
	// Build the key blob in a fresh slice. Appending the salt directly to
	// k.value could write into k.value's backing array whenever it has
	// spare capacity, letting two SPIs' blobs alias and clobber each other.
	keyWithSalt := make([]byte, 0, len(k.value)+len(salt))
	keyWithSalt = append(keyWithSalt, k.value...)
	keyWithSalt = append(keyWithSalt, salt...)
	return &netlink.XfrmStateAlgo{
		Name:   "rfc4106(gcm(aes))",
		Key:    keyWithSalt,
		ICVLen: 64,
	}
}
// setKeys replaces the driver's entire encryption key set. All existing
// kernel XFRM policies/states and the per-peer security map are dropped
// first; keys[0] is treated elsewhere as the primary (transmit) key.
func (d *driver) setKeys(keys []*key) error {
	d.encrMu.Lock()
	defer d.encrMu.Unlock()
	// Remove any stale policy, state
	clearEncryptionStates()
	// Accept the encryption keys and clear any stale encryption map
	d.secMap = encrMap{}
	d.keys = keys
	log.G(context.TODO()).Debugf("Initial encryption keys: %v", keys)
	return nil
}
// updateKeys allows to add a new key and/or change the primary key and/or prune an existing key
// The primary key is the key used in transmission and will go in first position in the list.
//
// Keys are matched by tag. After validating that every requested key was
// found, the per-node SAs/SPs are reprogrammed (updateNodeKey) before the
// in-memory key list is mutated, so updateNodeKey sees the pre-swap
// ordering (index 0 still being the old primary).
func (d *driver) updateKeys(newKey, primary, pruneKey *key) error {
	d.encrMu.Lock()
	defer d.encrMu.Unlock()
	log.G(context.TODO()).Debugf("Updating Keys. New: %v, Primary: %v, Pruned: %v", newKey, primary, pruneKey)
	log.G(context.TODO()).Debugf("Current: %v", d.keys)
	var (
		newIdx = -1
		priIdx = -1
		delIdx = -1
		lIP    = d.bindAddress
		aIP    = d.advertiseAddress
	)
	// add new
	if newKey != nil {
		d.keys = append(d.keys, newKey)
		// -1 + len == index of the freshly appended key.
		newIdx += len(d.keys)
	}
	for i, k := range d.keys {
		if primary != nil && k.tag == primary.tag {
			priIdx = i
		}
		if pruneKey != nil && k.tag == pruneKey.tag {
			delIdx = i
		}
	}
	// Every requested operation must have resolved to an index.
	if (newKey != nil && newIdx == -1) ||
		(primary != nil && priIdx == -1) ||
		(pruneKey != nil && delIdx == -1) {
		return types.InvalidParameterErrorf("cannot find proper key indices while processing key update:"+
			"(newIdx,priIdx,delIdx):(%d, %d, %d)", newIdx, priIdx, delIdx)
	}
	if priIdx != -1 && priIdx == delIdx {
		return types.InvalidParameterErrorf("attempting to both make a key (index %d) primary and delete it", priIdx)
	}
	// Reprogram kernel state for every known peer using the old ordering.
	for rIP, node := range d.secMap {
		idxs := updateNodeKey(lIP.AsSlice(), aIP.AsSlice(), rIP.AsSlice(), node.spi, d.keys, newIdx, priIdx, delIdx)
		if idxs != nil {
			d.secMap[rIP] = encrNode{idxs, node.count}
		}
	}
	// swap primary
	if priIdx != -1 {
		d.keys[0], d.keys[priIdx] = d.keys[priIdx], d.keys[0]
	}
	// prune
	if delIdx != -1 {
		// If the key to delete was at position 0 it has just been swapped
		// to the old primary's slot.
		if delIdx == 0 {
			delIdx = priIdx
		}
		d.keys = append(d.keys[:delIdx], d.keys[delIdx+1:]...)
	}
	log.G(context.TODO()).Debugf("Updated: %v", d.keys)
	return nil
}
/********************************************************
* Steady state: rSA0, rSA1, rSA2, fSA1, fSP1
* Rotation --> -rSA0, +rSA3, +fSA2, +fSP2/-fSP1, -fSA1
* Steady state: rSA1, rSA2, rSA3, fSA2, fSP2
*********************************************************/
// SPIs and keys are sorted in such a way that the one in position 0 is the primary
// updateNodeKey reprograms the SAs/SPs for a single remote node rIP to
// reflect a key-set change (see the rotation diagram above), then applies
// the same append/swap/prune transformation to the node's SPI list that
// updateKeys applies to the key list, returning the updated list.
// lIP is the local bind address, aIP the advertise address.
func updateNodeKey(lIP, aIP, rIP net.IP, idxs []spi, curKeys []*key, newIdx, priIdx, delIdx int) []spi {
	log.G(context.TODO()).Debugf("Updating keys for node: %s (%d,%d,%d)", rIP, newIdx, priIdx, delIdx)
	spis := idxs
	log.G(context.TODO()).Debugf("Current: %v", spis)
	// add new
	if newIdx != -1 {
		spis = append(spis, spi{
			forward: buildSPI(aIP, rIP, curKeys[newIdx].tag),
			reverse: buildSPI(rIP, aIP, curKeys[newIdx].tag),
		})
	}
	if delIdx != -1 {
		// -rSA0: stop accepting traffic encrypted with the pruned key.
		programSA(lIP, rIP, spis[delIdx], nil, reverse, false)
	}
	if newIdx > -1 {
		// +rSA2: start accepting traffic encrypted with the new key.
		programSA(lIP, rIP, spis[newIdx], curKeys[newIdx], reverse, true)
	}
	if priIdx > 0 {
		// +fSA2: install the forward SA for the new primary key.
		fSA2, _, _ := programSA(lIP, rIP, spis[priIdx], curKeys[priIdx], forward, true)
		// +fSP2, -fSP1: atomically repoint the outbound policy at the new
		// forward SA via XfrmPolicyUpdate.
		s := getMinimalIP(fSA2.Src)
		d := getMinimalIP(fSA2.Dst)
		fullMask := net.CIDRMask(8*len(s), 8*len(s))
		fSP1 := &netlink.XfrmPolicy{
			Src:     &net.IPNet{IP: s, Mask: fullMask},
			Dst:     &net.IPNet{IP: d, Mask: fullMask},
			Dir:     netlink.XFRM_DIR_OUT,
			Proto:   syscall.IPPROTO_UDP,
			DstPort: int(overlayutils.VXLANUDPPort()),
			Mark:    &spMark,
			Tmpls: []netlink.XfrmPolicyTmpl{
				{
					Src:   fSA2.Src,
					Dst:   fSA2.Dst,
					Proto: netlink.XFRM_PROTO_ESP,
					Mode:  netlink.XFRM_MODE_TRANSPORT,
					Spi:   fSA2.Spi,
					Reqid: mark,
				},
			},
		}
		log.G(context.TODO()).Debugf("Updating fSP{%s}", fSP1)
		if err := ns.NlHandle().XfrmPolicyUpdate(fSP1); err != nil {
			log.G(context.TODO()).Warnf("Failed to update fSP{%s}: %v", fSP1, err)
		}
		// -fSA1: remove the forward SA of the old primary key.
		programSA(lIP, rIP, spis[0], nil, forward, false)
	}
	// swap: mirror the primary promotion done on the key list.
	if priIdx > 0 {
		swp := spis[0]
		spis[0] = spis[priIdx]
		spis[priIdx] = swp
	}
	// prune: mirror the deletion done on the key list.
	if delIdx != -1 {
		if delIdx == 0 {
			delIdx = priIdx
		}
		spis = append(spis[:delIdx], spis[delIdx+1:]...)
	}
	log.G(context.TODO()).Debugf("Updated: %v", spis)
	return spis
}
// maxMTU returns the usable MTU for endpoints on this network: the
// configured MTU (default 1500) minus the VXLAN encapsulation overhead,
// further reduced on secure networks to leave room for ESP expansion.
func (n *network) maxMTU() int {
	base := 1500
	if n.mtu != 0 {
		base = n.mtu
	}
	m := base - vxlanEncap
	if n.secure {
		// Account for ESP packet expansion and round down to a 4-byte
		// multiple for cipher padding.
		m -= pktExpansion
		m -= m % 4
	}
	return m
}
// clearEncryptionStates removes from the kernel every XFRM policy and
// state owned by this driver, identified by the driver's policy mark
// (spMark) and request id (mark) respectively.
func clearEncryptionStates() {
	handle := ns.NlHandle()
	policies, err := handle.XfrmPolicyList(netlink.FAMILY_ALL)
	if err != nil {
		log.G(context.TODO()).Warnf("Failed to retrieve SP list for cleanup: %v", err)
	}
	states, err := handle.XfrmStateList(netlink.FAMILY_ALL)
	if err != nil {
		log.G(context.TODO()).Warnf("Failed to retrieve SA list for cleanup: %v", err)
	}
	for _, pol := range policies {
		if pol.Mark == nil || pol.Mark.Value != spMark.Value {
			continue // not ours
		}
		if err := handle.XfrmPolicyDel(&pol); err != nil {
			log.G(context.TODO()).Warnf("Failed to delete stale SP %s: %v", pol, err)
			continue
		}
		log.G(context.TODO()).Debugf("Removed stale SP: %s", pol)
	}
	for _, state := range states {
		if state.Reqid != mark {
			continue // not ours
		}
		if err := handle.XfrmStateDel(&state); err != nil {
			log.G(context.TODO()).Warnf("Failed to delete stale SA %s: %v", state, err)
			continue
		}
		log.G(context.TODO()).Debugf("Removed stale SA: %s", state)
	}
}
//go:build linux
package overlay
import (
"context"
"errors"
"fmt"
"net"
"net/netip"
"syscall"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/internal/netiputil"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/docker/docker/daemon/libnetwork/osl"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/gogo/protobuf/proto"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
)
// Join method is invoked when a Sandbox is attached to an endpoint.
// It wires the endpoint into the overlay: validates encryption
// prerequisites for secure networks, joins the subnet's sandbox, creates
// a veth pair (one end moved into the overlay namespace and enslaved to
// the subnet bridge, the other handed to the container), registers the
// endpoint in the local peer database, and publishes a PeerRecord so
// other nodes learn about this endpoint.
func (d *driver) Join(ctx context.Context, nid, eid string, sboxKey string, jinfo driverapi.JoinInfo, epOpts, _ map[string]interface{}) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.drivers.overlay.Join", trace.WithAttributes(
		attribute.String("nid", nid),
		attribute.String("eid", eid),
		attribute.String("sboxKey", sboxKey)))
	defer span.End()
	if err := validateID(nid, eid); err != nil {
		return err
	}
	n, unlock, err := d.lockNetwork(nid)
	if err != nil {
		return err
	}
	defer unlock()
	ep := n.endpoints[eid]
	if ep == nil {
		return fmt.Errorf("could not find endpoint with id %s", eid)
	}
	if n.secure {
		// Snapshot the key count under the encryption lock; joining a
		// secure network with no keys would pass traffic unencrypted.
		d.encrMu.Lock()
		nkeys := len(d.keys)
		d.encrMu.Unlock()
		if nkeys == 0 {
			return errors.New("cannot join secure network: encryption keys not present")
		}
	}
	nlh := ns.NlHandle()
	if n.secure && !nlh.SupportsNetlinkFamily(syscall.NETLINK_XFRM) {
		return errors.New("cannot join secure network: required modules to install IPSEC rules are missing on host")
	}
	s := n.getSubnetforIP(ep.addr)
	if s == nil {
		return fmt.Errorf("could not find subnet for endpoint %s", eid)
	}
	if err := n.joinSandbox(s, true); err != nil {
		return fmt.Errorf("network sandbox join failed: %v", err)
	}
	overlayIfName, containerIfName, err := createVethPair()
	if err != nil {
		return err
	}
	ep.ifName = containerIfName
	// Set the container interface and its peer MTU to 1450 to allow
	// for 50 bytes vxlan encap (inner eth header(14) + outer IP(20) +
	// outer UDP(8) + vxlan header(8))
	mtu := n.maxMTU()
	veth, err := nlh.LinkByName(overlayIfName)
	if err != nil {
		return fmt.Errorf("could not find link by name %s: %v", overlayIfName, err)
	}
	err = nlh.LinkSetMTU(veth, mtu)
	if err != nil {
		return err
	}
	// Move the overlay end of the veth into the network sandbox and
	// enslave it to the subnet's bridge.
	if err = n.sbox.AddInterface(ctx, overlayIfName, "veth", "", osl.WithMaster(s.brName)); err != nil {
		return fmt.Errorf("could not add veth pair inside the network sandbox: %v", err)
	}
	veth, err = nlh.LinkByName(containerIfName)
	if err != nil {
		return fmt.Errorf("could not find link by name %s: %v", containerIfName, err)
	}
	err = nlh.LinkSetMTU(veth, mtu)
	if err != nil {
		return err
	}
	if err = nlh.LinkSetHardwareAddr(veth, ep.mac); err != nil {
		return fmt.Errorf("could not set mac address (%v) to the container interface: %v", ep.mac, err)
	}
	// Add static routes to the network's other subnets via this subnet's
	// gateway; failures are logged but do not abort the join.
	for _, sub := range n.subnets {
		if sub == s {
			continue
		}
		if err = jinfo.AddStaticRoute(netiputil.ToIPNet(sub.subnetIP), types.NEXTHOP, s.gwIP.Addr().AsSlice()); err != nil {
			log.G(ctx).Errorf("Adding subnet %s static route in network %q failed\n", s.subnetIP, n.id)
		}
	}
	if iNames := jinfo.InterfaceName(); iNames != nil {
		err = iNames.SetNames(containerIfName, "eth", netlabel.GetIfname(epOpts))
		if err != nil {
			return err
		}
	}
	// Record the endpoint as a local peer (zero Addr marks it local).
	if err := n.peerAdd(eid, ep.addr, ep.mac, netip.Addr{}); err != nil {
		return fmt.Errorf("overlay: failed to add local endpoint to network peer db: %w", err)
	}
	buf, err := proto.Marshal(&PeerRecord{
		EndpointIP:       ep.addr.String(),
		EndpointMAC:      ep.mac.String(),
		TunnelEndpointIP: d.advertiseAddress.String(),
	})
	if err != nil {
		return err
	}
	if err := jinfo.AddTableEntry(ovPeerTable, eid, buf); err != nil {
		log.G(ctx).Errorf("overlay: Failed adding table entry to joininfo: %v", err)
	}
	return nil
}
// DecodeTableEntry translates a raw ovPeerTable entry into a display key
// plus attribute map. Unknown tables and undecodable records yield ("", nil).
func (d *driver) DecodeTableEntry(tablename string, key string, value []byte) (string, map[string]string) {
	if tablename != ovPeerTable {
		log.G(context.TODO()).Errorf("DecodeTableEntry: unexpected table name %s", tablename)
		return "", nil
	}
	var rec PeerRecord
	if err := proto.Unmarshal(value, &rec); err != nil {
		log.G(context.TODO()).Errorf("DecodeTableEntry: failed to unmarshal peer record for key %s: %v", key, err)
		return "", nil
	}
	return key, map[string]string{"Host IP": rec.TunnelEndpointIP}
}
// EventNotify handles gossip events for the ovPeerTable: it decodes the
// PeerRecord carried in value and adds or deletes the corresponding
// remote peer (endpoint IP/MAC plus VTEP) in the network's peer database.
// Events about this node's own advertise address are ignored. All
// failures are logged; the method has no way to report errors upstream.
func (d *driver) EventNotify(etype driverapi.EventType, nid, tableName, key string, value []byte) {
	if tableName != ovPeerTable {
		log.G(context.TODO()).Errorf("Unexpected table notification for table %s received", tableName)
		return
	}
	// The table key is the endpoint id.
	eid := key
	var peer PeerRecord
	if err := proto.Unmarshal(value, &peer); err != nil {
		log.G(context.TODO()).Errorf("Failed to unmarshal peer record: %v", err)
		return
	}
	// Ignore local peers. We already know about them and they
	// should not be added to vxlan fdb.
	if addr, _ := netip.ParseAddr(peer.TunnelEndpointIP); addr == d.advertiseAddress {
		return
	}
	addr, err := netip.ParsePrefix(peer.EndpointIP)
	if err != nil {
		log.G(context.TODO()).WithError(err).Errorf("Invalid peer IP %s received in event notify", peer.EndpointIP)
		return
	}
	mac, err := net.ParseMAC(peer.EndpointMAC)
	if err != nil {
		log.G(context.TODO()).WithError(err).Errorf("Invalid mac %s received in event notify", peer.EndpointMAC)
		return
	}
	vtep, err := netip.ParseAddr(peer.TunnelEndpointIP)
	if err != nil {
		log.G(context.TODO()).WithError(err).Errorf("Invalid VTEP %s received in event notify", peer.TunnelEndpointIP)
		return
	}
	n, unlock, err := d.lockNetwork(nid)
	if err != nil {
		log.G(context.TODO()).WithFields(log.Fields{
			"error": err,
			"nid":   nid,
		}).Error("overlay: handling peer event")
		return
	}
	defer unlock()
	var opname string
	if etype == driverapi.Delete {
		opname = "delete"
		err = n.peerDelete(eid, addr, mac, vtep)
	} else {
		opname = "add"
		err = n.peerAdd(eid, addr, mac, vtep)
	}
	if err != nil {
		log.G(context.TODO()).WithFields(log.Fields{
			"error": err,
			"nid":   n.id,
			"peer":  peer,
			"op":    opname,
		}).Warn("Peer operation failed")
	}
}
// Leave method is invoked when a Sandbox detaches from an endpoint.
// It removes the endpoint from the local peer database and drops the
// network's sandbox reference (tearing the sandbox down on last leave).
func (d *driver) Leave(nid, eid string) error {
	if err := validateID(nid, eid); err != nil {
		return err
	}
	n, unlock, err := d.lockNetwork(nid)
	if err != nil {
		return err
	}
	defer unlock()
	ep := n.endpoints[eid]
	if ep == nil {
		// Maskable: a missing endpoint on leave is not fatal to callers.
		return types.InternalMaskableErrorf("could not find endpoint with id %s", eid)
	}
	// Zero Addr marks the peer entry as local.
	if err := n.peerDelete(eid, ep.addr, ep.mac, netip.Addr{}); err != nil {
		return fmt.Errorf("overlay: failed to delete local endpoint eid:%s from network peer db: %w", eid, err)
	}
	n.leaveSandbox()
	return nil
}
package overlay
import (
"strconv"
"github.com/docker/docker/daemon/libnetwork/osl/kernel"
)
// ovConfig lists sysctl overrides applied for overlay networking: the
// neighbor-table GC thresholds are raised so large clusters do not
// overflow the kernel's default ARP cache. CheckFn compares the current
// value against the target — presumably so a value an admin already set
// higher is left alone (see kernel.ApplyOSTweaks; confirm against it).
var ovConfig = map[string]*kernel.OSValue{
	"net.ipv4.neigh.default.gc_thresh1": {Value: "8192", CheckFn: checkHigher},
	"net.ipv4.neigh.default.gc_thresh2": {Value: "49152", CheckFn: checkHigher},
	"net.ipv4.neigh.default.gc_thresh3": {Value: "65536", CheckFn: checkHigher},
}
// checkHigher reports whether val1 is strictly less than val2 when both
// are parsed as base-10 32-bit integers. Unparseable values compare as 0
// (parse errors are deliberately ignored).
func checkHigher(val1, val2 string) bool {
	a, _ := strconv.ParseInt(val1, 10, 32)
	b, _ := strconv.ParseInt(val2, 10, 32)
	return a < b
}
// applyOStweaks applies the overlay driver's sysctl overrides (ovConfig)
// to the host kernel.
func applyOStweaks() {
	kernel.ApplyOSTweaks(ovConfig)
}
//go:build linux
package overlay
import (
"context"
"errors"
"fmt"
"net"
"net/netip"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/internal/netiputil"
"github.com/docker/docker/daemon/libnetwork/netutils"
"github.com/docker/docker/daemon/libnetwork/ns"
)
// endpointTable maps endpoint ids to their endpoint state.
type endpointTable map[string]*endpoint

// endpoint is the overlay driver's per-endpoint state.
type endpoint struct {
	id     string           // endpoint id
	nid    string           // owning network id
	ifName string           // container-side veth name; empty until Join
	mac    net.HardwareAddr // endpoint MAC address
	addr   netip.Prefix     // endpoint IP with its subnet's prefix length
}
// CreateEndpoint registers a new endpoint on network nid. The endpoint's
// address must fall inside one of the network's subnets; when the caller
// supplied no MAC, one is deterministically generated from the IP and
// reported back through ifInfo.
func (d *driver) CreateEndpoint(_ context.Context, nid, eid string, ifInfo driverapi.InterfaceInfo, epOptions map[string]interface{}) error {
	var err error
	if err = validateID(nid, eid); err != nil {
		return err
	}
	// Since we perform lazy configuration make sure we try
	// configuring the driver when we enter CreateEndpoint since
	// CreateNetwork may not be called in every node.
	if err := d.configure(); err != nil {
		return err
	}
	n, unlock, err := d.lockNetwork(nid)
	if err != nil {
		return err
	}
	defer unlock()
	ep := &endpoint{
		id:  eid,
		nid: n.id,
		mac: ifInfo.MacAddress(),
	}
	var ok bool
	ep.addr, ok = netiputil.ToPrefix(ifInfo.Address())
	if !ok {
		return errors.New("create endpoint was not passed interface IP address")
	}
	if s := n.getSubnetforIP(ep.addr); s == nil {
		return fmt.Errorf("no matching subnet for IP %q in network %q", ep.addr, nid)
	}
	if ep.mac == nil {
		// No MAC supplied: derive one from the IP so it is stable.
		ep.mac = netutils.GenerateMACFromIP(ep.addr.Addr().AsSlice())
		if err := ifInfo.SetMacAddress(ep.mac); err != nil {
			return err
		}
	}
	n.endpoints[ep.id] = ep
	return nil
}
// DeleteEndpoint removes the endpoint from the network's endpoint table
// and, when a veth interface was attached, best-effort deletes that link.
func (d *driver) DeleteEndpoint(nid, eid string) error {
	nlh := ns.NlHandle()
	if err := validateID(nid, eid); err != nil {
		return err
	}
	n, unlock, err := d.lockNetwork(nid)
	if err != nil {
		return err
	}
	defer unlock()
	ep, ok := n.endpoints[eid]
	if !ok || ep == nil {
		return fmt.Errorf("endpoint id %q not found", eid)
	}
	delete(n.endpoints, eid)
	if ep.ifName == "" {
		// Endpoint was never joined; nothing to clean up in the kernel.
		return nil
	}
	// Link cleanup is best-effort: the endpoint is already gone from the
	// driver's perspective, so failures are only logged.
	link, err := nlh.LinkByName(ep.ifName)
	if err != nil {
		log.G(context.TODO()).Debugf("Failed to retrieve interface (%s)'s link on endpoint (%s) delete: %v", ep.ifName, ep.id, err)
		return nil
	}
	if err := nlh.LinkDel(link); err != nil {
		log.G(context.TODO()).Debugf("Failed to delete interface (%s)'s link on endpoint (%s) delete: %v", ep.ifName, ep.id, err)
	}
	return nil
}
// EndpointOperInfo returns operational data for an endpoint; the overlay
// driver exposes none, so the returned map is always empty.
func (d *driver) EndpointOperInfo(nid, eid string) (map[string]interface{}, error) {
	return map[string]interface{}{}, nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23 && linux
package overlay
import (
"context"
"errors"
"fmt"
"net/netip"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/drivers/overlay/overlayutils"
"github.com/docker/docker/daemon/libnetwork/internal/countmap"
"github.com/docker/docker/daemon/libnetwork/internal/netiputil"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/docker/docker/daemon/libnetwork/osl"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/internal/nlwrap"
"github.com/hashicorp/go-multierror"
"github.com/vishvananda/netlink"
"github.com/vishvananda/netns"
"golang.org/x/sys/unix"
)
var (
	// networkOnce guards the one-time population of vniTbl.
	networkOnce sync.Once
	// networkMu protects vniTbl.
	networkMu sync.Mutex
	// vniTbl maps VXLAN VNIs to the sandbox namespace path that
	// currently owns them (populated from leftover namespaces at start).
	vniTbl = make(map[uint32]string)
)
// networkTable maps network ids to their network state.
type networkTable map[string]*network

// subnet is the per-subnet state of an overlay network: its VNI and the
// bridge/vxlan devices created for it inside the network sandbox.
type subnet struct {
	sboxInit  bool         // whether the subnet's sandbox devices were set up
	vxlanName string       // vxlan device name, set after successful init
	brName    string       // bridge device name, set after successful init
	vni       uint32       // VXLAN network identifier for this subnet
	initErr   error        // last (recoverable) subnet init error
	subnetIP  netip.Prefix // the subnet itself
	gwIP      netip.Prefix // gateway address within the subnet
}
// network is the overlay driver's per-network state.
type network struct {
	id     string
	driver *driver
	secure bool // whether traffic must be IPSec-encrypted
	mtu    int  // configured MTU; 0 means use the default

	// mu must be held when accessing any of the variable struct fields below,
	// calling any method on the network not noted as safe for concurrent use,
	// or manipulating the driver.networks key for this network id.
	// This mutex is at the top of the lock hierarchy: any other locks in
	// package structs can be locked while holding this lock.
	mu        sync.Mutex
	sbox      *osl.Namespace
	endpoints endpointTable
	joinCnt   int // number of active sandbox joins; sandbox torn down at 0
	// Ref count of VXLAN Forwarding Database entries programmed into the kernel
	fdbCnt    countmap.Map[ipmac]
	sboxInit  bool  // whether initSandbox has been attempted
	initEpoch int   // bumped per initSandbox to make namespace keys unique
	initErr   error // result of the (unrecoverable) sandbox init attempt
	subnets   []*subnet
	peerdb    peerMap
}
func init() {
	// Lock main() to the initial thread to exclude the goroutines executing
	// func setDefaultVLAN() from being scheduled onto that thread. Changes to
	// the network namespace of the initial thread alter /proc/self/ns/net,
	// which would break any code which (incorrectly) assumes that /proc/self/ns/net
	// is a handle to the network namespace for the thread it is currently
	// executing on.
	runtime.LockOSThread()
}
// NetworkAllocate is not supported by the overlay driver.
func (d *driver) NetworkAllocate(id string, option map[string]string, ipV4Data, ipV6Data []driverapi.IPAMData) (map[string]string, error) {
	return nil, types.NotImplementedErrorf("not implemented")
}
// NetworkFree is not supported by the overlay driver.
func (d *driver) NetworkFree(id string) error {
	return types.NotImplementedErrorf("not implemented")
}
// CreateNetwork creates the overlay network state for id from the
// provided IPAM data and generic options (VNI list, secure flag, MTU).
// It registers the network in the driver's table — handling a race with
// a concurrent DeleteNetwork of a same-id predecessor — clears any stale
// iptables rules for the VNIs of a non-secure network, and registers the
// peer table with libnetwork.
func (d *driver) CreateNetwork(ctx context.Context, id string, option map[string]interface{}, nInfo driverapi.NetworkInfo, ipV4Data, ipV6Data []driverapi.IPAMData) error {
	if id == "" {
		return errors.New("invalid network id")
	}
	if len(ipV4Data) == 0 || ipV4Data[0].Pool.String() == "0.0.0.0/0" {
		return types.InvalidParameterErrorf("ipv4 pool is empty")
	}
	// Since we perform lazy configuration make sure we try
	// configuring the driver when we enter CreateNetwork
	if err := d.configure(); err != nil {
		return err
	}
	n := &network{
		id:        id,
		driver:    d,
		endpoints: endpointTable{},
		subnets:   []*subnet{},
		fdbCnt:    countmap.Map[ipmac]{},
	}
	vnis := make([]uint32, 0, len(ipV4Data))
	gval, ok := option[netlabel.GenericData]
	if !ok {
		return fmt.Errorf("option %s is missing", netlabel.GenericData)
	}
	optMap := gval.(map[string]string)
	vnisOpt, ok := optMap[netlabel.OverlayVxlanIDList]
	if !ok {
		return errors.New("no VNI provided")
	}
	log.G(context.TODO()).Debugf("overlay: Received vxlan IDs: %s", vnisOpt)
	var err error
	vnis, err = overlayutils.AppendVNIList(vnis, vnisOpt)
	if err != nil {
		return err
	}
	if _, ok := optMap[secureOption]; ok {
		n.secure = true
	}
	if val, ok := optMap[netlabel.DriverMTU]; ok {
		var err error
		if n.mtu, err = strconv.Atoi(val); err != nil {
			return fmt.Errorf("failed to parse %v: %v", val, err)
		}
		if n.mtu < 0 {
			return fmt.Errorf("invalid MTU value: %v", n.mtu)
		}
	}
	// One VNI is required per subnet (extra VNIs are tolerated).
	if len(vnis) == 0 {
		return errors.New("no VNI provided")
	} else if len(vnis) < len(ipV4Data) {
		return fmt.Errorf("insufficient vnis(%d) passed to overlay", len(vnis))
	}
	for i, ipd := range ipV4Data {
		s := &subnet{vni: vnis[i]}
		s.subnetIP, _ = netiputil.ToPrefix(ipd.Pool)
		s.gwIP, _ = netiputil.ToPrefix(ipd.Gateway)
		n.subnets = append(n.subnets, s)
	}
	// Lock the network before adding it to the networks table so we can
	// release the big driver lock before we finish initializing the network
	// while continuing to exclude other operations on the network from
	// proceeding until we are done.
	n.mu.Lock()
	defer n.mu.Unlock()
	d.mu.Lock()
	oldnet := d.networks[id]
	if oldnet == nil {
		d.networks[id] = n
		d.mu.Unlock()
	} else {
		// The network already exists, but we might be racing DeleteNetwork.
		// Synchronize and check again.
		d.mu.Unlock()
		oldnet.mu.Lock()
		d.mu.Lock()
		_, ok := d.networks[id]
		if !ok {
			// It's gone! Stake our claim to the network id.
			d.networks[id] = n
		}
		d.mu.Unlock()
		oldnet.mu.Unlock()
		if ok {
			return fmt.Errorf("attempt to create overlay network %v that already exists", n.id)
		}
	}
	// Make sure no rule is on the way from any stale secure network.
	// Removal is best-effort; errors are deliberately ignored.
	if !n.secure {
		for _, vni := range vnis {
			d.programMangle(vni, false)
			d.programInput(vni, false)
		}
	}
	if nInfo != nil {
		if err := nInfo.TableEventRegister(ovPeerTable, driverapi.EndpointObject); err != nil {
			return err
		}
	}
	return nil
}
// DeleteNetwork tears down network nid: best-effort deletes any leftover
// endpoint veth links, removes the iptables rules of a secure network's
// subnets, and drops the network from the driver's table.
func (d *driver) DeleteNetwork(nid string) error {
	if nid == "" {
		return errors.New("invalid network id")
	}
	// Make sure driver resources are initialized before proceeding
	if err := d.configure(); err != nil {
		return err
	}
	n, unlock, err := d.lockNetwork(nid)
	if err != nil {
		return err
	}
	// Unlock the network even if it's going to become garbage as another
	// goroutine could be blocked waiting for the lock, such as in
	// (*driver).lockNetwork.
	defer unlock()
	for _, ep := range n.endpoints {
		if ep.ifName != "" {
			if link, err := ns.NlHandle().LinkByName(ep.ifName); err == nil {
				if err := ns.NlHandle().LinkDel(link); err != nil {
					log.G(context.TODO()).WithError(err).Warnf("Failed to delete interface (%s)'s link on endpoint (%s) delete", ep.ifName, ep.id)
				}
			}
		}
	}
	if n.secure {
		for _, s := range n.subnets {
			if err := d.programMangle(s.vni, false); err != nil {
				log.G(context.TODO()).WithFields(log.Fields{
					"error":      err,
					"network_id": n.id,
					"subnet":     s.subnetIP,
				}).Warn("Failed to clean up iptables rules during overlay network deletion")
			}
			if err := d.programInput(s.vni, false); err != nil {
				log.G(context.TODO()).WithFields(log.Fields{
					"error":      err,
					"network_id": n.id,
					"subnet":     s.subnetIP,
				}).Warn("Failed to clean up iptables rules during overlay network deletion")
			}
		}
	}
	d.mu.Lock()
	delete(d.networks, nid)
	d.mu.Unlock()
	return nil
}
// joinSandbox lazily initializes the network sandbox and the sandbox
// state for subnet s, optionally incrementing the join ref count.
// Network-level init is attempted at most once (its failure is cached in
// n.initErr and is unrecoverable); subnet-level init failures are
// recoverable and retried on the next join. Must be called with the
// network lock held.
func (n *network) joinSandbox(s *subnet, incJoinCount bool) error {
	// If there is a race between two go routines here only one will win
	// the other will wait.
	networkOnce.Do(populateVNITbl)
	var initialized bool
	if !n.sboxInit {
		n.initErr = n.initSandbox()
		initialized = n.initErr == nil
		// If there was an error, we cannot recover it
		n.sboxInit = true
	}
	if n.initErr != nil {
		return fmt.Errorf("network sandbox join failed: %v", n.initErr)
	}
	subnetErr := s.initErr
	if !s.sboxInit {
		subnetErr = n.initSubnetSandbox(s)
		// We can recover from these errors
		if subnetErr == nil {
			s.initErr = subnetErr
			s.sboxInit = true
		}
	}
	if subnetErr != nil {
		return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), subnetErr)
	}
	if incJoinCount {
		n.joinCnt++
	}
	if initialized {
		// Fresh sandbox: replay the known peers into it.
		if err := n.initSandboxPeerDB(); err != nil {
			log.G(context.TODO()).WithFields(log.Fields{
				"nid":   n.id,
				"error": err,
			}).Warn("failed to initialize network peer database")
		}
	}
	return nil
}
// leaveSandbox drops one sandbox join reference and, when the count
// reaches zero, destroys the sandbox and resets all init state so a
// future join re-creates everything from scratch.
// NOTE(review): an unbalanced extra leave would drive joinCnt negative
// and the != 0 check would then never trigger teardown — confirm callers
// always pair joins and leaves.
func (n *network) leaveSandbox() {
	n.joinCnt--
	if n.joinCnt != 0 {
		return
	}
	n.destroySandbox()
	n.sboxInit = false
	n.initErr = nil
	for _, s := range n.subnets {
		s.sboxInit = false
		s.initErr = nil
	}
}
// destroySandbox tears down the network sandbox: removes every interface
// from the namespace, deletes each subnet's vxlan device (its VNI is a
// global resource that must be freed), then destroys the namespace.
// To be called while holding the network lock.
func (n *network) destroySandbox() {
	if n.sbox != nil {
		for _, iface := range n.sbox.Interfaces() {
			if err := iface.Remove(); err != nil {
				log.G(context.TODO()).Debugf("Remove interface %s failed: %v", iface.SrcName(), err)
			}
		}
		for _, s := range n.subnets {
			if s.vxlanName != "" {
				err := deleteInterface(s.vxlanName)
				if err != nil {
					log.G(context.TODO()).Warnf("could not cleanup sandbox properly: %v", err)
				}
			}
		}
		n.sbox.Destroy()
		n.sbox = nil
	}
}
// populateVNITbl scans the osl sandbox directory for namespaces left
// over from a previous daemon instance and records, per VXLAN VNI, the
// namespace path owning it. Run once (via networkOnce) so stale
// namespaces can later be reclaimed in setupSubnetSandbox. All per-entry
// failures are logged and skipped so the walk always completes.
func populateVNITbl() {
	filepath.WalkDir(filepath.Dir(osl.GenerateKey("walk")),
		// NOTE(cpuguy83): The linter picked up on the fact that this walk function was not using this error argument
		// That seems wrong... however I'm not familiar with this code or if that error matters
		func(path string, _ os.DirEntry, _ error) error {
			// Sandbox namespace files have a "<epoch>-<id>" name shape;
			// skip anything without a dash.
			_, fname := filepath.Split(path)
			if len(strings.Split(fname, "-")) <= 1 {
				return nil
			}
			n, err := netns.GetFromPath(path)
			if err != nil {
				log.G(context.TODO()).Errorf("Could not open namespace path %s during vni population: %v", path, err)
				return nil
			}
			defer n.Close()
			nlh, err := nlwrap.NewHandleAt(n, unix.NETLINK_ROUTE)
			if err != nil {
				log.G(context.TODO()).Errorf("Could not open netlink handle during vni population for ns %s: %v", path, err)
				return nil
			}
			defer nlh.Close()
			err = nlh.SetSocketTimeout(soTimeout)
			if err != nil {
				log.G(context.TODO()).Warnf("Failed to set the timeout on the netlink handle sockets for vni table population: %v", err)
			}
			links, err := nlh.LinkList()
			if err != nil {
				log.G(context.TODO()).Errorf("Failed to list interfaces during vni population for ns %s: %v", path, err)
				return nil
			}
			// Record every vxlan device's VNI as owned by this namespace.
			for _, l := range links {
				if l.Type() == "vxlan" {
					vniTbl[uint32(l.(*netlink.Vxlan).VxlanId)] = path
				}
			}
			return nil
		})
}
// generateVxlanName returns the vxlan device name for subnet s, built
// from the zero-padded hex VNI and a short (at most 5 char) network-id
// suffix.
func (n *network) generateVxlanName(s *subnet) string {
	suffix := n.id
	if len(suffix) > 5 {
		suffix = suffix[:5]
	}
	return fmt.Sprintf("vx-%06x-%v", s.vni, suffix)
}
// generateBridgeName returns the bridge device name for subnet s: the
// "ov-<vni>" prefix plus a short (at most 5 char) network-id suffix.
func (n *network) generateBridgeName(s *subnet) string {
	suffix := n.id
	if len(suffix) > 5 {
		suffix = suffix[:5]
	}
	return n.getBridgeNamePrefix(s) + "-" + suffix
}
// getBridgeNamePrefix returns the bridge-name prefix for subnet s:
// "ov-" followed by the zero-padded hex VNI.
func (n *network) getBridgeNamePrefix(s *subnet) string {
	return "ov-" + fmt.Sprintf("%06x", s.vni)
}
// setupSubnetSandbox creates subnet s's bridge and vxlan devices inside
// the network sandbox, after reclaiming the subnet's VNI from any stale
// namespace left by a previous daemon life. On vxlan attach failure the
// bridge and the vxlan device are rolled back so the caller can retry.
func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error {
	// Try to find this subnet's vni is being used in some
	// other namespace by looking at vniTbl that we just
	// populated in the once init. If a hit is found then
	// it must a stale namespace from previous
	// life. Destroy it completely and reclaim resourced.
	networkMu.Lock()
	path, ok := vniTbl[s.vni]
	networkMu.Unlock()
	if ok {
		deleteVxlanByVNI(path, s.vni)
		if err := unix.Unmount(path, unix.MNT_FORCE); err != nil {
			log.G(context.TODO()).Errorf("unmount of %s failed: %v", path, err)
		}
		os.Remove(path)
		networkMu.Lock()
		delete(vniTbl, s.vni)
		networkMu.Unlock()
	}
	// create a bridge and vxlan device for this subnet and move it to the sandbox
	sbox := n.sbox
	if err := sbox.AddInterface(context.TODO(), brName, "br", "", osl.WithIPv4Address(netiputil.ToIPNet(s.gwIP)), osl.WithIsBridge(true)); err != nil {
		return fmt.Errorf("bridge creation in sandbox failed for subnet %q: %v", s.subnetIP.String(), err)
	}
	v6transport, err := n.driver.isIPv6Transport()
	if err != nil {
		log.G(context.TODO()).WithError(err).Errorf("Assuming IPv4 transport; overlay network %s will not pass traffic if the Swarm data plane is IPv6.", n.id)
	}
	if err := createVxlan(vxlanName, s.vni, n.maxMTU(), v6transport); err != nil {
		return err
	}
	if err := sbox.AddInterface(context.TODO(), vxlanName, "vxlan", "", osl.WithMaster(brName)); err != nil {
		// If adding vxlan device to the overlay namespace fails, remove the bridge interface we
		// already added to the namespace. This allows the caller to try the setup again.
		for _, iface := range sbox.Interfaces() {
			if iface.SrcName() == brName {
				if ierr := iface.Remove(); ierr != nil {
					log.G(context.TODO()).Errorf("removing bridge failed from ov ns %v failed, %v", n.sbox.Key(), ierr)
				}
			}
		}
		// Also, delete the vxlan interface. Since a global vni id is associated
		// with the vxlan interface, an orphaned vxlan interface will result in
		// failure of vxlan device creation if the vni is assigned to some other
		// network.
		if deleteErr := deleteInterface(vxlanName); deleteErr != nil {
			log.G(context.TODO()).Warnf("could not delete vxlan interface, %s, error %v, after config error, %v", vxlanName, deleteErr, err)
		}
		return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.String(), err)
	}
	if err := setDefaultVLAN(sbox); err != nil {
		// not a fatal error
		log.G(context.TODO()).WithError(err).Error("set bridge default vlan failed")
	}
	return nil
}
// setDefaultVLAN disables the default PVID (writes 0 to
// bridge/default_pvid) on the bridge device inside the given sandbox
// namespace, going through a remounted sysfs because netlink cannot be
// used on the kernels Docker still supports (see below).
func setDefaultVLAN(ns *osl.Namespace) error {
	var brName string
	// Find the (last) bridge interface in the sandbox.
	for _, i := range ns.Interfaces() {
		if i.Bridge() {
			brName = i.DstName()
		}
	}
	// IFLA_BR_VLAN_DEFAULT_PVID was added in Linux v4.4 (see torvalds/linux@0f963b7), so we can't use netlink for
	// setting this until Docker drops support for CentOS/RHEL 7 (kernel 3.10, eol date: 2024-06-30).
	var innerErr error
	err := ns.InvokeFunc(func() {
		// Contrary to what the sysfs(5) man page says, the entries of /sys/class/net
		// represent the networking devices visible in the network namespace of the
		// process which mounted the sysfs filesystem, irrespective of the network
		// namespace of the process accessing the directory. Remount sysfs in order to
		// see the network devices in sbox's network namespace, making sure the mount
		// doesn't propagate back.
		//
		// The Linux implementation of (osl.Sandbox).InvokeFunc() runs the function in a
		// dedicated goroutine. The effects of unshare(CLONE_NEWNS) on a thread cannot
		// be reverted so the thread needs to be terminated once the goroutine is
		// finished.
		runtime.LockOSThread()
		if err := unix.Unshare(unix.CLONE_NEWNS); err != nil {
			innerErr = os.NewSyscallError("unshare", err)
			return
		}
		if err := unix.Mount("", "/", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
			innerErr = &os.PathError{Op: "mount", Path: "/", Err: err}
			return
		}
		if err := unix.Mount("sysfs", "/sys", "sysfs", 0, ""); err != nil {
			innerErr = &os.PathError{Op: "mount", Path: "/sys", Err: err}
			return
		}
		path := filepath.Join("/sys/class/net", brName, "bridge/default_pvid")
		data := []byte{'0', '\n'}
		if err := os.WriteFile(path, data, 0o644); err != nil {
			innerErr = fmt.Errorf("failed to enable default vlan on bridge %s: %w", brName, err)
			return
		}
	})
	if err != nil {
		return err
	}
	return innerErr
}
// initSubnetSandbox plumbs subnet s into the network sandbox: it first
// programs (or clears) the iptables rules tied to the subnet's VNI, then
// creates the bridge + vxlan device pair, recording their names on s
// only after success. Must be called with the network lock.
func (n *network) initSubnetSandbox(s *subnet) error {
	brName := n.generateBridgeName(s)
	vxlanName := n.generateVxlanName(s)
	// Program iptables rules for mandatory encryption of the secure
	// network, or clean up leftover rules for a stale secure network which
	// was previously assigned the same VNI.
	if err := n.driver.programMangle(s.vni, n.secure); err != nil {
		return err
	}
	if err := n.driver.programInput(s.vni, n.secure); err != nil {
		if n.secure {
			// Roll back the mangle rule too; both errors are combined.
			return multierror.Append(err, n.driver.programMangle(s.vni, false))
		}
		// NOTE(review): for a non-secure network a failure to remove the
		// (possibly absent) INPUT rule is deliberately ignored.
	}
	if err := n.setupSubnetSandbox(s, brName, vxlanName); err != nil {
		return err
	}
	s.vxlanName = vxlanName
	s.brName = brName
	return nil
}
// cleanupStaleSandboxes walks the osl sandbox directory and destroys any
// namespace from a previous daemon life whose name embeds (a prefix of)
// this network's id: it deletes the namespace's vxlan devices, unmounts
// and removes the file, and purges matching vniTbl entries.
func (n *network) cleanupStaleSandboxes() {
	filepath.WalkDir(filepath.Dir(osl.GenerateKey("walk")),
		func(path string, _ os.DirEntry, _ error) error {
			// Sandbox namespace files look like "<epoch>-<id prefix>".
			_, fname := filepath.Split(path)
			pList := strings.Split(fname, "-")
			if len(pList) <= 1 {
				return nil
			}
			pattern := pList[1]
			if strings.Contains(n.id, pattern) {
				// Delete all vnis
				deleteVxlanByVNI(path, 0)
				unix.Unmount(path, unix.MNT_DETACH)
				os.Remove(path)
				// Now that we have destroyed this
				// sandbox, remove all references to
				// it in vniTbl so that we don't
				// inadvertently destroy the sandbox
				// created in this life.
				networkMu.Lock()
				for vni, tblPath := range vniTbl {
					if tblPath == path {
						delete(vniTbl, vni)
					}
				}
				networkMu.Unlock()
			}
			return nil
		})
}
// initSandbox creates the network's kernel sandbox (via osl) and prepares it
// for peer programming.
//
// Stale sandboxes from a previous daemon life are cleaned up first, and the
// epoch counter is bumped so the new sandbox key cannot collide with one
// being removed.
func (n *network) initSandbox() error {
	n.initEpoch++

	// If there are any stale sandboxes related to this network
	// from previous daemon life clean it up here
	n.cleanupStaleSandboxes()

	key := osl.GenerateKey(fmt.Sprintf("%d-", n.initEpoch) + n.id)
	sbox, err := osl.NewSandbox(key, true, false)
	if err != nil {
		// Wrap (%w) instead of formatting (%v) so callers can inspect
		// the underlying error; consistent with the peerdb code in
		// this package.
		return fmt.Errorf("could not get network sandbox: %w", err)
	}

	// this is needed to let the peerAdd configure the sandbox
	n.sbox = sbox
	n.fdbCnt = countmap.Map[ipmac]{}

	return nil
}
// lockNetwork returns the network object for nid, locked for exclusive access.
//
// It is the caller's responsibility to release the network lock by calling the
// returned unlock function.
func (d *driver) lockNetwork(nid string) (n *network, unlock func(), err error) {
	d.mu.Lock()
	n = d.networks[nid]
	d.mu.Unlock()

	// Optimistic retry loop: lock the candidate network without holding
	// d.mu, then re-check that it is still the registered object for nid.
	// d.mu and n.mu are never held simultaneously here.
	for {
		if n == nil {
			return nil, nil, fmt.Errorf("network %q not found", nid)
		}
		// We can't lock the network object while holding the driver
		// lock or we risk a lock order reversal deadlock.
		n.mu.Lock()
		// d.networks[nid] might have been replaced or removed after we
		// unlocked the driver lock. Double-check that the network we
		// just locked is the active network object for the nid.
		d.mu.Lock()
		n2 := d.networks[nid]
		d.mu.Unlock()
		if n2 == n {
			return n, n.mu.Unlock, nil
		}
		// We locked a garbage object. Spin until the network we locked
		// matches up with the one present in the table.
		n.mu.Unlock()
		n = n2
	}
}
// getSubnetforIP returns the subnet to which the given IP belongs, or nil
// when no subnet of the network matches.
func (n *network) getSubnetforIP(ip netip.Prefix) *subnet {
	for _, sn := range n.subnets {
		// A subnet matches only when both the prefix length and the
		// address range agree.
		if sn.subnetIP.Bits() == ip.Bits() && sn.subnetIP.Contains(ip.Addr()) {
			return sn
		}
	}
	return nil
}
//go:build linux
package overlay
import (
"context"
"errors"
"fmt"
"net"
"syscall"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/drivers/overlay/overlayutils"
"github.com/docker/docker/daemon/libnetwork/netutils"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/docker/docker/internal/nlwrap"
"github.com/vishvananda/netlink"
"github.com/vishvananda/netns"
)
// soTimeout is the socket timeout applied to netlink handles created in this
// file (see deleteVxlanByVNI); it mirrors the libnetwork-wide default.
var soTimeout = ns.NetlinkSocketsTimeout
// validateID checks that both the network ID and the endpoint ID are
// non-empty, returning a descriptive error for the first missing one.
func validateID(nid, eid string) error {
	switch {
	case nid == "":
		return errors.New("invalid network id")
	case eid == "":
		return errors.New("invalid endpoint id")
	default:
		return nil
	}
}
// createVethPair creates a veth pair with generated names and returns the
// host-side and sandbox-side interface names, in that order.
func createVethPair() (string, string, error) {
	nlh := ns.NlHandle()

	// Generate a name for what will be the host side pipe interface
	name1, err := netutils.GenerateIfaceName(nlh, vethPrefix, vethLen)
	if err != nil {
		// %w (not %v) so the netlink error stays inspectable.
		return "", "", fmt.Errorf("error generating veth name1: %w", err)
	}

	// Generate a name for what will be the sandbox side pipe interface
	name2, err := netutils.GenerateIfaceName(nlh, vethPrefix, vethLen)
	if err != nil {
		return "", "", fmt.Errorf("error generating veth name2: %w", err)
	}

	// Generate and add the interface pipe host <-> sandbox
	veth := &netlink.Veth{
		LinkAttrs: netlink.LinkAttrs{Name: name1, TxQLen: 0},
		PeerName:  name2,
	}
	if err := nlh.LinkAdd(veth); err != nil {
		return "", "", fmt.Errorf("error creating veth pair: %w", err)
	}

	return name1, name2, nil
}
// createVxlan creates a vxlan link with the given name, VNI and MTU.
// vtepIPv6 selects an IPv6 outer transport for the tunnel endpoints.
func createVxlan(name string, vni uint32, mtu int, vtepIPv6 bool) error {
	vxlan := &netlink.Vxlan{
		LinkAttrs: netlink.LinkAttrs{Name: name, MTU: mtu},
		VxlanId:   int(vni),
		Learning:  true,
		Port:      int(overlayutils.VXLANUDPPort()),
		Proxy:     true,
		L3miss:    true,
		L2miss:    true,
	}

	// The kernel restricts the destination VTEP (virtual tunnel endpoint) in
	// VXLAN forwarding database entries to a single address family, defaulting
	// to IPv4 unless either an IPv6 group or default remote destination address
	// is configured when the VXLAN link is created.
	//
	// Set up the VXLAN link for IPv6 destination addresses by setting the VXLAN
	// group address to the IPv6 unspecified address, like iproute2.
	// https://github.com/iproute2/iproute2/commit/97d564b90ccb1e4a3c756d9caae161f55b2b63a2
	// https://patchwork.ozlabs.org/project/netdev/patch/20180917171325.GA2660@localhost.localdomain/
	if vtepIPv6 {
		vxlan.Group = net.IPv6unspecified
	}

	if err := ns.NlHandle().LinkAdd(vxlan); err != nil {
		// %w (not %v) so the netlink error stays inspectable.
		return fmt.Errorf("error creating vxlan interface: %w", err)
	}

	return nil
}
// deleteInterface looks up the named link in the initial namespace and
// deletes it.
func deleteInterface(name string) error {
	nlh := ns.NlHandle()

	link, err := nlh.LinkByName(name)
	if err != nil {
		return fmt.Errorf("failed to find interface with name %s: %v", name, err)
	}
	if err = nlh.LinkDel(link); err != nil {
		return fmt.Errorf("error deleting interface with name %s: %v", name, err)
	}
	return nil
}
// deleteVxlanByVNI deletes the first vxlan link whose VNI matches vni. A vni
// of 0 matches any vxlan link. An empty path operates on the current network
// namespace; otherwise the namespace mounted at path is used.
func deleteVxlanByVNI(path string, vni uint32) error {
	nlh := ns.NlHandle()
	if path != "" {
		// Named nsh (not "ns") so it does not shadow the imported ns
		// package used above — the original local name "ns" did.
		nsh, err := netns.GetFromPath(path)
		if err != nil {
			return fmt.Errorf("failed to get ns handle for %s: %v", path, err)
		}
		defer nsh.Close()
		nlh, err = nlwrap.NewHandleAt(nsh, syscall.NETLINK_ROUTE)
		if err != nil {
			return fmt.Errorf("failed to get netlink handle for ns %s: %v", path, err)
		}
		defer nlh.Close()
		// A missing socket timeout is not fatal; deletion proceeds.
		if err := nlh.SetSocketTimeout(soTimeout); err != nil {
			log.G(context.TODO()).Warnf("Failed to set the timeout on the netlink handle sockets for vxlan deletion: %v", err)
		}
	}

	links, err := nlh.LinkList()
	if err != nil {
		return fmt.Errorf("failed to list interfaces while deleting vxlan interface by vni: %v", err)
	}

	for _, l := range links {
		if l.Type() == "vxlan" && (vni == 0 || l.(*netlink.Vxlan).VxlanId == int(vni)) {
			err = nlh.LinkDel(l)
			if err != nil {
				return fmt.Errorf("error deleting vxlan interface with id %d: %v", vni, err)
			}
			return nil
		}
	}

	return fmt.Errorf("could not find a vxlan interface to delete with id %d", vni)
}
//go:build linux
package overlay
//go:generate protoc -I=. -I=../../../../vendor/ --gogofaster_out=import_path=github.com/docker/docker/daemon/libnetwork/drivers/overlay:. overlay.proto
import (
"context"
"errors"
"fmt"
"net/netip"
"sync"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/discoverapi"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/scope"
)
const (
	// NetworkType is the driver name this package registers with libnetwork.
	NetworkType = "overlay"
	// vethPrefix/vethLen shape generated veth interface names: the "veth"
	// prefix plus 7 generated characters.
	vethPrefix = "veth"
	vethLen    = len(vethPrefix) + 7
	// vxlanEncap is presumably the VXLAN encapsulation overhead in bytes;
	// not referenced in this chunk — confirm at its call sites.
	vxlanEncap = 50
	// secureOption is the network option key selecting encryption
	// (see the n.secure handling elsewhere in the driver).
	secureOption = "encrypted"
)
// overlay driver must implement the discover-API.
var _ discoverapi.Discover = (*driver)(nil)

type driver struct {
	// Immutable; mu does not need to be held when accessing these fields.
	config map[string]interface{}
	initOS sync.Once // ensures applyOStweaks runs at most once (see configure)

	// encrMu guards secMap and keys,
	// and synchronizes the application of encryption parameters
	// to the kernel.
	//
	// This mutex is above mu in the lock hierarchy.
	// Do not lock any locks aside from mu while holding encrMu.
	encrMu sync.Mutex
	secMap encrMap
	keys   []*key

	// mu must be held when accessing the fields which follow it
	// in the struct definition.
	//
	// This mutex is at the bottom of the lock hierarchy:
	// do not lock any other locks while holding it.
	mu               sync.Mutex
	bindAddress      netip.Addr // set from node discovery data in nodeJoin
	advertiseAddress netip.Addr // data-plane address; set in nodeJoin
	networks         networkTable
}
// Register registers a new instance of the overlay driver.
func Register(r driverapi.Registerer, config map[string]interface{}) error {
	d := &driver{
		config:   config,
		networks: networkTable{},
		secMap:   encrMap{},
	}
	caps := driverapi.Capability{
		DataScope:         scope.Global,
		ConnectivityScope: scope.Global,
	}
	return r.RegisterDriver(NetworkType, d, caps)
}
// configure applies OS-specific kernel tweaks; initOS guarantees this
// happens at most once per process.
func (d *driver) configure() error {
	// Apply OS specific kernel configs if needed
	d.initOS.Do(applyOStweaks)
	return nil
}

// Type returns the driver name registered with libnetwork ("overlay").
func (d *driver) Type() string {
	return NetworkType
}

// IsBuiltIn reports that this driver is shipped with the daemon rather than
// provided by a remote plugin.
func (d *driver) IsBuiltIn() bool {
	return true
}
// isIPv6Transport reports whether the outer Layer-3 transport for VXLAN datagrams is IPv6.
func (d *driver) isIPv6Transport() (bool, error) {
	// Infer whether remote peers' virtual tunnel endpoints will be IPv4 or IPv6
	// from the address family of our own advertise address. This is a
	// reasonable inference to make as Linux VXLAN links do not support
	// mixed-address-family remote peers.
	//
	// NOTE(review): the driver struct documents advertiseAddress as guarded
	// by d.mu, yet it is read here without holding the lock — confirm the
	// synchronization expectations of this method's callers.
	if !d.advertiseAddress.IsValid() {
		return false, errors.New("overlay: cannot determine address family of transport: the local data-plane address is not currently known")
	}
	return d.advertiseAddress.Is6(), nil
}
// nodeJoin records this node's advertise and bind addresses from a self
// node-discovery event; events for other nodes are ignored.
func (d *driver) nodeJoin(data discoverapi.NodeDiscoveryData) error {
	if !data.Self {
		return nil
	}

	advAddr, _ := netip.ParseAddr(data.Address)
	if !advAddr.IsValid() {
		return errors.New("invalid discovery data")
	}
	// The bind address is optional; an unparseable value is stored as the
	// zero (invalid) netip.Addr, matching the advertise-first validation.
	bindAddr, _ := netip.ParseAddr(data.BindAddress)

	d.mu.Lock()
	defer d.mu.Unlock()
	d.advertiseAddress = advAddr
	d.bindAddress = bindAddr
	return nil
}
// DiscoverNew is a notification for a new discovery event, such as a new node joining a cluster
func (d *driver) DiscoverNew(dType discoverapi.DiscoveryType, data interface{}) error {
	switch dType {
	case discoverapi.NodeDiscovery:
		nodeData, ok := data.(discoverapi.NodeDiscoveryData)
		if !ok {
			return fmt.Errorf("invalid discovery data type: %T", data)
		}
		return d.nodeJoin(nodeData)

	case discoverapi.EncryptionKeysConfig:
		cfg, ok := data.(discoverapi.DriverEncryptionConfig)
		if !ok {
			return errors.New("invalid encryption key notification data")
		}
		// Pair each key value with its tag and install the full set.
		keys := make([]*key, 0, len(cfg.Keys))
		for i, v := range cfg.Keys {
			keys = append(keys, &key{value: v, tag: uint32(cfg.Tags[i])})
		}
		// Installation failures are logged rather than returned.
		if err := d.setKeys(keys); err != nil {
			log.G(context.TODO()).Warn(err)
		}

	case discoverapi.EncryptionKeysUpdate:
		update, ok := data.(discoverapi.DriverEncryptionUpdate)
		if !ok {
			return errors.New("invalid encryption key notification data")
		}
		var newKey, priKey, delKey *key
		if update.Key != nil {
			newKey = &key{value: update.Key, tag: uint32(update.Tag)}
		}
		if update.Primary != nil {
			priKey = &key{value: update.Primary, tag: uint32(update.PrimaryTag)}
		}
		if update.Prune != nil {
			delKey = &key{value: update.Prune, tag: uint32(update.PruneTag)}
		}
		if err := d.updateKeys(newKey, priKey, delKey); err != nil {
			return err
		}
	}
	return nil
}
// DiscoverDelete is a notification for a discovery delete event, such as a node leaving a cluster
//
// This implementation is a no-op: the notification is accepted and ignored.
func (d *driver) DiscoverDelete(dType discoverapi.DiscoveryType, data interface{}) error {
	return nil
}
// Code generated by protoc-gen-gogo. DO NOT EDIT.
// source: overlay.proto
package overlay
import (
fmt "fmt"
_ "github.com/gogo/protobuf/gogoproto"
proto "github.com/gogo/protobuf/proto"
io "io"
math "math"
math_bits "math/bits"
reflect "reflect"
strings "strings"
)
// NOTE: everything in this section is emitted by protoc-gen-gogo from
// overlay.proto; regenerate via the file's go:generate directive instead of
// editing by hand.

// Reference imports to suppress errors if they are not otherwise used.
var _ = proto.Marshal
var _ = fmt.Errorf
var _ = math.Inf

// This is a compile-time assertion to ensure that this generated file
// is compatible with the proto package it is being compiled against.
// A compilation error at this line likely means your copy of the
// proto package needs to be updated.
const _ = proto.GoGoProtoPackageIsVersion3 // please upgrade the proto package

// PeerRecord defines the information corresponding to a peer
// container in the overlay network.
type PeerRecord struct {
	// Endpoint IP is the IP of the container attachment on the
	// given overlay network.
	EndpointIP string `protobuf:"bytes,1,opt,name=endpoint_ip,json=endpointIp,proto3" json:"endpoint_ip,omitempty"`
	// Endpoint MAC is the mac address of the container attachment
	// on the given overlay network.
	EndpointMAC string `protobuf:"bytes,2,opt,name=endpoint_mac,json=endpointMac,proto3" json:"endpoint_mac,omitempty"`
	// Tunnel Endpoint IP defines the host IP for the host in
	// which this container is running and can be reached by
	// building a tunnel to that host IP.
	TunnelEndpointIP string `protobuf:"bytes,3,opt,name=tunnel_endpoint_ip,json=tunnelEndpointIp,proto3" json:"tunnel_endpoint_ip,omitempty"`
}

func (m *PeerRecord) Reset()      { *m = PeerRecord{} }
func (*PeerRecord) ProtoMessage() {}
func (*PeerRecord) Descriptor() ([]byte, []int) {
	return fileDescriptor_61fc82527fbe24ad, []int{0}
}
func (m *PeerRecord) XXX_Unmarshal(b []byte) error {
	return m.Unmarshal(b)
}
func (m *PeerRecord) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
	if deterministic {
		return xxx_messageInfo_PeerRecord.Marshal(b, m, deterministic)
	} else {
		b = b[:cap(b)]
		n, err := m.MarshalToSizedBuffer(b)
		if err != nil {
			return nil, err
		}
		return b[:n], nil
	}
}
func (m *PeerRecord) XXX_Merge(src proto.Message) {
	xxx_messageInfo_PeerRecord.Merge(m, src)
}
func (m *PeerRecord) XXX_Size() int {
	return m.Size()
}
func (m *PeerRecord) XXX_DiscardUnknown() {
	xxx_messageInfo_PeerRecord.DiscardUnknown(m)
}

var xxx_messageInfo_PeerRecord proto.InternalMessageInfo

// Generated getters are nil-receiver safe by construction.
func (m *PeerRecord) GetEndpointIP() string {
	if m != nil {
		return m.EndpointIP
	}
	return ""
}

func (m *PeerRecord) GetEndpointMAC() string {
	if m != nil {
		return m.EndpointMAC
	}
	return ""
}

func (m *PeerRecord) GetTunnelEndpointIP() string {
	if m != nil {
		return m.TunnelEndpointIP
	}
	return ""
}
// Generated registration of the PeerRecord type and its raw (gzipped)
// descriptor; emitted by protoc-gen-gogo — do not edit by hand.
func init() {
	proto.RegisterType((*PeerRecord)(nil), "overlay.PeerRecord")
}

func init() { proto.RegisterFile("overlay.proto", fileDescriptor_61fc82527fbe24ad) }

var fileDescriptor_61fc82527fbe24ad = []byte{
	// 233 bytes of a gzipped FileDescriptorProto
	0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0xe2, 0xcd, 0x2f, 0x4b, 0x2d,
	0xca, 0x49, 0xac, 0xd4, 0x2b, 0x28, 0xca, 0x2f, 0xc9, 0x17, 0x62, 0x87, 0x72, 0xa5, 0x74, 0xd3,
	0x33, 0x4b, 0x32, 0x4a, 0x93, 0xf4, 0x92, 0xf3, 0x73, 0xf5, 0xd3, 0xf3, 0xd3, 0xf3, 0xf5, 0xc1,
	0xf2, 0x49, 0xa5, 0x69, 0x60, 0x1e, 0x98, 0x03, 0x66, 0x41, 0xf4, 0x29, 0x6d, 0x65, 0xe4, 0xe2,
	0x0a, 0x48, 0x4d, 0x2d, 0x0a, 0x4a, 0x4d, 0xce, 0x2f, 0x4a, 0x11, 0xd2, 0xe7, 0xe2, 0x4e, 0xcd,
	0x4b, 0x29, 0xc8, 0xcf, 0xcc, 0x2b, 0x89, 0xcf, 0x2c, 0x90, 0x60, 0x54, 0x60, 0xd4, 0xe0, 0x74,
	0xe2, 0x7b, 0x74, 0x4f, 0x9e, 0xcb, 0x15, 0x2a, 0xec, 0x19, 0x10, 0xc4, 0x05, 0x53, 0xe2, 0x59,
	0x20, 0x64, 0xc4, 0xc5, 0x03, 0xd7, 0x90, 0x9b, 0x98, 0x2c, 0xc1, 0x04, 0xd6, 0xc1, 0xff, 0xe8,
	0x9e, 0x3c, 0x37, 0x4c, 0x87, 0xaf, 0xa3, 0x73, 0x10, 0xdc, 0x54, 0xdf, 0xc4, 0x64, 0x21, 0x27,
	0x2e, 0xa1, 0x92, 0xd2, 0xbc, 0xbc, 0xd4, 0x9c, 0x78, 0x64, 0xbb, 0x98, 0xc1, 0x3a, 0x45, 0x1e,
	0xdd, 0x93, 0x17, 0x08, 0x01, 0xcb, 0x22, 0xd9, 0x28, 0x50, 0x82, 0x2a, 0x52, 0xe0, 0xa4, 0x72,
	0xe3, 0xa1, 0x1c, 0xc3, 0x87, 0x87, 0x72, 0x8c, 0x0d, 0x8f, 0xe4, 0x18, 0x4f, 0x3c, 0x92, 0x63,
	0xbc, 0xf0, 0x48, 0x8e, 0xf1, 0xc1, 0x23, 0x39, 0xc6, 0x09, 0x8f, 0xe5, 0x18, 0x2e, 0x3c, 0x96,
	0x63, 0xb8, 0xf1, 0x58, 0x8e, 0x21, 0x89, 0x0d, 0xec, 0x49, 0x63, 0x40, 0x00, 0x00, 0x00, 0xff,
	0xff, 0xd4, 0x37, 0x59, 0xc8, 0x2d, 0x01, 0x00, 0x00,
}
// GoString renders the record as Go source syntax; generated by
// protoc-gen-gogo — do not edit by hand.
func (this *PeerRecord) GoString() string {
	if this == nil {
		return "nil"
	}
	s := make([]string, 0, 7)
	s = append(s, "&overlay.PeerRecord{")
	s = append(s, "EndpointIP: "+fmt.Sprintf("%#v", this.EndpointIP)+",\n")
	s = append(s, "EndpointMAC: "+fmt.Sprintf("%#v", this.EndpointMAC)+",\n")
	s = append(s, "TunnelEndpointIP: "+fmt.Sprintf("%#v", this.TunnelEndpointIP)+",\n")
	s = append(s, "}")
	return strings.Join(s, "")
}

// valueToGoStringOverlay is a generated helper for GoString output.
func valueToGoStringOverlay(v interface{}, typ string) string {
	rv := reflect.ValueOf(v)
	if rv.IsNil() {
		return "nil"
	}
	pv := reflect.Indirect(rv).Interface()
	return fmt.Sprintf("func(v %v) *%v { return &v } ( %#v )", typ, typ, pv)
}
// Generated wire-format marshaling; fields are written back-to-front into a
// pre-sized buffer. Emitted by protoc-gen-gogo — do not edit by hand.
func (m *PeerRecord) Marshal() (dAtA []byte, err error) {
	size := m.Size()
	dAtA = make([]byte, size)
	n, err := m.MarshalToSizedBuffer(dAtA[:size])
	if err != nil {
		return nil, err
	}
	return dAtA[:n], nil
}

func (m *PeerRecord) MarshalTo(dAtA []byte) (int, error) {
	size := m.Size()
	return m.MarshalToSizedBuffer(dAtA[:size])
}

func (m *PeerRecord) MarshalToSizedBuffer(dAtA []byte) (int, error) {
	i := len(dAtA)
	_ = i
	var l int
	_ = l
	if len(m.TunnelEndpointIP) > 0 {
		i -= len(m.TunnelEndpointIP)
		copy(dAtA[i:], m.TunnelEndpointIP)
		i = encodeVarintOverlay(dAtA, i, uint64(len(m.TunnelEndpointIP)))
		i--
		dAtA[i] = 0x1a
	}
	if len(m.EndpointMAC) > 0 {
		i -= len(m.EndpointMAC)
		copy(dAtA[i:], m.EndpointMAC)
		i = encodeVarintOverlay(dAtA, i, uint64(len(m.EndpointMAC)))
		i--
		dAtA[i] = 0x12
	}
	if len(m.EndpointIP) > 0 {
		i -= len(m.EndpointIP)
		copy(dAtA[i:], m.EndpointIP)
		i = encodeVarintOverlay(dAtA, i, uint64(len(m.EndpointIP)))
		i--
		dAtA[i] = 0xa
	}
	return len(dAtA) - i, nil
}

func encodeVarintOverlay(dAtA []byte, offset int, v uint64) int {
	offset -= sovOverlay(v)
	base := offset
	for v >= 1<<7 {
		dAtA[offset] = uint8(v&0x7f | 0x80)
		v >>= 7
		offset++
	}
	dAtA[offset] = uint8(v)
	return base
}
// Generated size and string helpers; emitted by protoc-gen-gogo — do not
// edit by hand.
func (m *PeerRecord) Size() (n int) {
	if m == nil {
		return 0
	}
	var l int
	_ = l
	l = len(m.EndpointIP)
	if l > 0 {
		n += 1 + l + sovOverlay(uint64(l))
	}
	l = len(m.EndpointMAC)
	if l > 0 {
		n += 1 + l + sovOverlay(uint64(l))
	}
	l = len(m.TunnelEndpointIP)
	if l > 0 {
		n += 1 + l + sovOverlay(uint64(l))
	}
	return n
}

func sovOverlay(x uint64) (n int) {
	return (math_bits.Len64(x|1) + 6) / 7
}
func sozOverlay(x uint64) (n int) {
	return sovOverlay(uint64((x << 1) ^ uint64((int64(x) >> 63))))
}

func (this *PeerRecord) String() string {
	if this == nil {
		return "nil"
	}
	s := strings.Join([]string{`&PeerRecord{`,
		`EndpointIP:` + fmt.Sprintf("%v", this.EndpointIP) + `,`,
		`EndpointMAC:` + fmt.Sprintf("%v", this.EndpointMAC) + `,`,
		`TunnelEndpointIP:` + fmt.Sprintf("%v", this.TunnelEndpointIP) + `,`,
		`}`,
	}, "")
	return s
}

func valueToStringOverlay(v interface{}) string {
	rv := reflect.ValueOf(v)
	if rv.IsNil() {
		return "nil"
	}
	pv := reflect.Indirect(rv).Interface()
	return fmt.Sprintf("*%v", pv)
}
// Generated wire-format unmarshaling; emitted by protoc-gen-gogo — do not
// edit by hand.
func (m *PeerRecord) Unmarshal(dAtA []byte) error {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		preIndex := iNdEx
		var wire uint64
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowOverlay
			}
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= uint64(b&0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: PeerRecord: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: PeerRecord: illegal tag %d (wire type %d)", fieldNum, wire)
		}
		switch fieldNum {
		case 1:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field EndpointIP", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowOverlay
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthOverlay
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthOverlay
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.EndpointIP = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		case 2:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field EndpointMAC", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowOverlay
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthOverlay
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthOverlay
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.EndpointMAC = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		case 3:
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field TunnelEndpointIP", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowOverlay
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthOverlay
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthOverlay
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.TunnelEndpointIP = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		default:
			iNdEx = preIndex
			skippy, err := skipOverlay(dAtA[iNdEx:])
			if err != nil {
				return err
			}
			if (skippy < 0) || (iNdEx+skippy) < 0 {
				return ErrInvalidLengthOverlay
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			iNdEx += skippy
		}
	}

	if iNdEx > l {
		return io.ErrUnexpectedEOF
	}
	return nil
}
// Generated helper that skips over an unknown field, returning the number of
// bytes consumed; emitted by protoc-gen-gogo — do not edit by hand.
func skipOverlay(dAtA []byte) (n int, err error) {
	l := len(dAtA)
	iNdEx := 0
	depth := 0
	for iNdEx < l {
		var wire uint64
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return 0, ErrIntOverflowOverlay
			}
			if iNdEx >= l {
				return 0, io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= (uint64(b) & 0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		wireType := int(wire & 0x7)
		switch wireType {
		case 0:
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return 0, ErrIntOverflowOverlay
				}
				if iNdEx >= l {
					return 0, io.ErrUnexpectedEOF
				}
				iNdEx++
				if dAtA[iNdEx-1] < 0x80 {
					break
				}
			}
		case 1:
			iNdEx += 8
		case 2:
			var length int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return 0, ErrIntOverflowOverlay
				}
				if iNdEx >= l {
					return 0, io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				length |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			if length < 0 {
				return 0, ErrInvalidLengthOverlay
			}
			iNdEx += length
		case 3:
			depth++
		case 4:
			if depth == 0 {
				return 0, ErrUnexpectedEndOfGroupOverlay
			}
			depth--
		case 5:
			iNdEx += 4
		default:
			return 0, fmt.Errorf("proto: illegal wireType %d", wireType)
		}
		if iNdEx < 0 {
			return 0, ErrInvalidLengthOverlay
		}
		if depth == 0 {
			return iNdEx, nil
		}
	}
	return 0, io.ErrUnexpectedEOF
}

// Generated sentinel errors used by Unmarshal and skipOverlay above.
var (
	ErrInvalidLengthOverlay        = fmt.Errorf("proto: negative length found during unmarshaling")
	ErrIntOverflowOverlay          = fmt.Errorf("proto: integer overflow")
	ErrUnexpectedEndOfGroupOverlay = fmt.Errorf("proto: unexpected end of group")
)
// Package overlayutils provides utility functions for overlay networks
package overlayutils
import (
"fmt"
"strconv"
"strings"
"sync"
)
var (
	// mutex guards vxlanUDPPort (written by ConfigVXLANUDPPort, read by
	// VXLANUDPPort).
	mutex        sync.RWMutex
	vxlanUDPPort = defaultVXLANUDPPort
)

// defaultVXLANUDPPort is the IANA-assigned VXLAN UDP port (RFC 7348).
const defaultVXLANUDPPort uint32 = 4789
// ConfigVXLANUDPPort configures the VXLAN UDP port (data path port) number.
// If no port is set, the default (4789) is returned. Valid port numbers are
// between 1024 and 49151.
func ConfigVXLANUDPPort(vxlanPort uint32) error {
	port := vxlanPort
	if port == 0 {
		port = defaultVXLANUDPPort
	}

	// IANA splits the port space into three ranges:
	//   0-1023      Well Known (System) Ports
	//   1024-49151  Registered (User) Ports
	//   49152-65535 Dynamic (Private) Ports
	// Only the Registered range is accepted here.
	if port < 1024 || port > 49151 {
		return fmt.Errorf("VXLAN UDP port number is not in valid range (1024-49151): %d", port)
	}

	mutex.Lock()
	defer mutex.Unlock()
	vxlanUDPPort = port
	return nil
}
// VXLANUDPPort returns Vxlan UDP port number
func VXLANUDPPort() uint32 {
	// Read lock suffices: the port is only written by ConfigVXLANUDPPort
	// under the write lock.
	mutex.RLock()
	defer mutex.RUnlock()
	return vxlanUDPPort
}
// AppendVNIList appends the VNI values encoded as a CSV string to slice.
//
// On a parse failure the values appended so far are returned along with the
// error. An empty csv string yields a parse error for the empty value.
func AppendVNIList(vnis []uint32, csv string) ([]uint32, error) {
	remaining := csv
	for {
		var (
			part string
			more bool
		)
		part, remaining, more = strings.Cut(remaining, ",")
		v, err := strconv.ParseUint(part, 10, 32)
		if err != nil {
			return vnis, fmt.Errorf("invalid vxlan id value %q passed", part)
		}
		vnis = append(vnis, uint32(v))
		if !more {
			return vnis, nil
		}
	}
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23 && linux
package overlay
import (
"context"
"errors"
"fmt"
"net"
"net/netip"
"syscall"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/internal/setmatrix"
"github.com/docker/docker/daemon/libnetwork/osl"
)
// ovPeerTable names the table under which overlay peer records are exchanged
// (referenced outside this chunk).
const ovPeerTable = "overlay_peer_table"

// peerEntry describes a single peer of an overlay network.
type peerEntry struct {
	eid  string     // endpoint ID the entry belongs to
	mac  macAddr    // peer MAC, packed into a hashable value
	vtep netip.Addr // VXLAN tunnel endpoint address; invalid for local peers
}

// isLocal reports whether the entry describes a peer on this host: local
// peers are stored with an invalid (zero) vtep address.
func (p *peerEntry) isLocal() bool {
	return !p.vtep.IsValid()
}

// peerMap is the in-memory peer database for one network, keyed by the
// peer's IP prefix; multiple entries per key can exist transiently.
type peerMap struct {
	mp setmatrix.SetMatrix[netip.Prefix, peerEntry]
}
// Walk invokes f once for every peer IP that currently has at least one
// entry in the database. Only the first entry per IP is reported (see Get).
func (pm *peerMap) Walk(f func(netip.Prefix, peerEntry)) {
	for _, peerAddr := range pm.mp.Keys() {
		entry, ok := pm.Get(peerAddr)
		if ok {
			f(peerAddr, entry)
		}
	}
}

// Get returns the first entry recorded for peerIP, if any. When several
// endpoints transiently share the same IP only one of them is returned.
func (pm *peerMap) Get(peerIP netip.Prefix) (peerEntry, bool) {
	c, _ := pm.mp.Get(peerIP)
	if len(c) == 0 {
		return peerEntry{}, false
	}
	return c[0], true
}
// Add records a peer entry under peerIP and returns whether it was newly
// inserted along with the resulting number of entries for that IP.
func (pm *peerMap) Add(eid string, peerIP netip.Prefix, peerMac net.HardwareAddr, vtep netip.Addr) (bool, int) {
	entry := peerEntry{
		eid:  eid,
		mac:  macAddrOf(peerMac),
		vtep: vtep,
	}
	inserted, cardinality := pm.mp.Insert(peerIP, entry)
	if cardinality != 1 {
		// Transient case, there is more than one endpoint that is using the same IP
		state, _ := pm.mp.String(peerIP)
		log.G(context.TODO()).Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", peerIP, cardinality, state)
	}
	return inserted, cardinality
}
// Delete removes the matching peer entry from under peerIP and returns
// whether a removal happened along with the number of entries left.
func (pm *peerMap) Delete(eid string, peerIP netip.Prefix, peerMac net.HardwareAddr, vtep netip.Addr) (bool, int) {
	entry := peerEntry{
		eid:  eid,
		mac:  macAddrOf(peerMac),
		vtep: vtep,
	}
	removed, cardinality := pm.mp.Remove(peerIP, entry)
	if cardinality != 0 {
		// Transient case, there is more than one endpoint that is using the same IP
		state, _ := pm.mp.String(peerIP)
		log.G(context.TODO()).Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", peerIP, cardinality, state)
	}
	return removed, cardinality
}
// The overlay uses a lazy initialization approach, this means that when a network is created
// and the driver registered the overlay does not allocate resources till the moment that a
// sandbox is actually created.
// At the moment of this call, that happens when a sandbox is initialized, is possible that
// networkDB has already delivered some events of peers already available on remote nodes,
// these peers are saved into the peerDB and this function is used to properly configure
// the network sandbox with all those peers that got previously notified.
//
// The caller is responsible for ensuring that peerAdd and peerDelete are not
// called concurrently with this function to guarantee consistency.
func (n *network) initSandboxPeerDB() error {
	var errs []error
	n.peerdb.Walk(func(peerIP netip.Prefix, pEntry peerEntry) {
		// Local peers (invalid vtep) need no kernel programming.
		if !pEntry.isLocal() {
			if err := n.addNeighbor(peerIP, pEntry.mac.HardwareAddr(), pEntry.vtep); err != nil {
				errs = append(errs, fmt.Errorf("failed to add neighbor entries for %s: %w", peerIP, err))
			}
		}
	})
	// Collect every failure instead of aborting on the first.
	return errors.Join(errs...)
}
// peerAdd adds a new entry to the peer database.
//
// Local peers are signified by an invalid vtep (i.e. netip.Addr{}).
func (n *network) peerAdd(eid string, peerIP netip.Prefix, peerMac net.HardwareAddr, vtep netip.Addr) error {
	if eid == "" {
		return errors.New("invalid endpoint id")
	}

	inserted, dbEntries := n.peerdb.Add(eid, peerIP, peerMac, vtep)
	if !inserted {
		log.G(context.TODO()).Warnf("Entry already present in db: nid:%s eid:%s peerIP:%v peerMac:%v vtep:%v",
			n.id, eid, peerIP, peerMac, vtep)
	}

	// Only remote peers (valid vtep) require kernel programming; local
	// peers live in the database only.
	if vtep.IsValid() {
		err := n.addNeighbor(peerIP, peerMac, vtep)
		if err != nil {
			if dbEntries > 1 && errors.As(err, &osl.NeighborSearchError{}) {
				// Conflicting neighbor entries are already programmed into the kernel and we are in the transient case.
				// Upon deletion if the active configuration is deleted the next one from the database will be restored.
				return nil
			}
			return fmt.Errorf("peer add operation failed: %w", err)
		}
	}
	return nil
}
// addNeighbor programs the kernel so the given peer is reachable through the VXLAN tunnel.
func (n *network) addNeighbor(peerIP netip.Prefix, peerMac net.HardwareAddr, vtep netip.Addr) error {
	if n.sbox == nil {
		// We are hitting this case for all the events that are arriving before that the sandbox
		// is being created. The peer got already added into the database and the sandbox init will
		// call the peerDbUpdateSandbox that will configure all these peers from the database
		return nil
	}

	s := n.getSubnetforIP(peerIP)
	if s == nil {
		return fmt.Errorf("couldn't find the subnet %q in network %q", peerIP.String(), n.id)
	}

	// Make sure the per-subnet sandbox resources exist before programming
	// entries that reference the subnet's vxlan interface.
	if err := n.joinSandbox(s, false); err != nil {
		return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err)
	}

	if n.secure {
		// Best-effort: encryption setup failures are logged, not fatal.
		if err := n.driver.setupEncryption(vtep); err != nil {
			log.G(context.TODO()).Warn(err)
		}
	}

	// Add neighbor entry for the peer IP
	if err := n.sbox.AddNeighbor(peerIP.Addr().AsSlice(), peerMac, osl.WithLinkName(s.vxlanName)); err != nil {
		return fmt.Errorf("could not add neighbor entry into the sandbox: %w", err)
	}

	// Add fdb entry to the bridge for the peer mac. The entry is
	// refcounted per <vtep,mac> pair so it is only programmed on the
	// first reference (deleteNeighbor removes it on the last).
	if n.fdbCnt.Add(ipmacOf(vtep, peerMac), 1) == 1 {
		if err := n.sbox.AddNeighbor(vtep.AsSlice(), peerMac, osl.WithLinkName(s.vxlanName), osl.WithFamily(syscall.AF_BRIDGE)); err != nil {
			return fmt.Errorf("could not add fdb entry into the sandbox: %w", err)
		}
	}
	return nil
}
// peerDelete removes an entry from the peer database.
//
// Local peers are signified by an invalid vtep (i.e. netip.Addr{}).
func (n *network) peerDelete(eid string, peerIP netip.Prefix, peerMac net.HardwareAddr, vtep netip.Addr) error {
	if eid == "" {
		return errors.New("invalid endpoint id")
	}

	logger := log.G(context.TODO()).WithFields(log.Fields{
		"nid":  n.id,
		"eid":  eid,
		"ip":   peerIP,
		"mac":  peerMac,
		"vtep": vtep,
	})

	deleted, dbEntries := n.peerdb.Delete(eid, peerIP, peerMac, vtep)
	if !deleted {
		logger.Warn("Peer entry was not in db")
	}

	// Only remote peers (valid vtep) have kernel state to remove.
	if vtep.IsValid() {
		err := n.deleteNeighbor(peerIP, peerMac, vtep)
		if err != nil {
			if dbEntries > 0 && errors.As(err, &osl.NeighborSearchError{}) {
				// We fall in here if there is a transient state and if the neighbor that is being deleted
				// was never been configured into the kernel (we allow only 1 configuration at the time per <ip,mac> mapping)
				return nil
			}
			logger.WithError(err).Warn("Peer delete operation failed")
		}
	}

	if dbEntries > 0 {
		// If there is still an entry into the database and the deletion went through without errors means that there is now no
		// configuration active in the kernel.
		// Restore one configuration for the ip directly from the database, note that is guaranteed that there is one
		peerEntry, ok := n.peerdb.Get(peerIP)
		if !ok {
			return fmt.Errorf("peerDelete: unable to restore a configuration: no entry for %v found in the database", peerIP)
		}
		err := n.addNeighbor(peerIP, peerEntry.mac.HardwareAddr(), peerEntry.vtep)
		if err != nil {
			return fmt.Errorf("peer delete operation failed: %w", err)
		}
	}
	return nil
}
// deleteNeighbor removes programming from the kernel for the given peer to be
// reachable through the VXLAN tunnel. It is the inverse of [driver.addNeighbor].
func (n *network) deleteNeighbor(peerIP netip.Prefix, peerMac net.HardwareAddr, vtep netip.Addr) error {
	// Nothing was ever programmed if no sandbox exists.
	if n.sbox == nil {
		return nil
	}
	if n.secure {
		// Best-effort: encryption teardown failures are logged, not fatal.
		if err := n.driver.removeEncryption(vtep); err != nil {
			log.G(context.TODO()).Warn(err)
		}
	}
	s := n.getSubnetforIP(peerIP)
	if s == nil {
		return fmt.Errorf("could not find the subnet %q in network %q", peerIP.String(), n.id)
	}
	// Remove fdb entry to the bridge for the peer mac, but only when the
	// last <vtep,mac> reference is released (the entry is shared by every
	// peer behind the same vtep/mac pair; see addNeighbor).
	if n.fdbCnt.Add(ipmacOf(vtep, peerMac), -1) == 0 {
		if err := n.sbox.DeleteNeighbor(vtep.AsSlice(), peerMac, osl.WithLinkName(s.vxlanName), osl.WithFamily(syscall.AF_BRIDGE)); err != nil {
			return fmt.Errorf("could not delete fdb entry in the sandbox: %w", err)
		}
	}
	// Delete neighbor entry for the peer IP. Wrap with %w (the original
	// used %v) so peerDelete's errors.As check for osl.NeighborSearchError
	// can actually see the underlying error; also adds the missing space
	// after the colon.
	if err := n.sbox.DeleteNeighbor(peerIP.Addr().AsSlice(), peerMac, osl.WithLinkName(s.vxlanName)); err != nil {
		return fmt.Errorf("could not delete neighbor entry in the sandbox: %w", err)
	}
	return nil
}
package overlay
// Handy utility types for making unhashable values hashable.
import (
"net"
"net/netip"
)
// macAddr is a hashable encoding of a MAC address.
type macAddr uint64

// macAddrOf packs a 6-byte hardware address into a macAddr, big-endian.
// Addresses that are not exactly 6 bytes long map to 0.
func macAddrOf(mac net.HardwareAddr) macAddr {
	if len(mac) != 6 {
		return 0
	}
	var packed macAddr
	for _, octet := range mac {
		packed = packed<<8 | macAddr(octet)
	}
	return packed
}

// HardwareAddr converts a macAddr back to a net.HardwareAddr.
func (p macAddr) HardwareAddr() net.HardwareAddr {
	out := make(net.HardwareAddr, 6)
	for i := 5; i >= 0; i-- {
		out[i] = byte(p)
		p >>= 8
	}
	return out
}

// String returns p.HardwareAddr().String().
func (p macAddr) String() string {
	return p.HardwareAddr().String()
}

// ipmac is a hashable tuple of an IP address and a MAC address suitable for use as a map key.
type ipmac struct {
	ip  netip.Addr
	mac macAddr
}

// ipmacOf is a convenience constructor for creating an ipmac from a [net.HardwareAddr].
func ipmacOf(ip netip.Addr, mac net.HardwareAddr) ipmac {
	return ipmac{
		ip:  ip,
		mac: macAddrOf(mac),
	}
}

// String renders the tuple as "<ip> <mac>".
func (i ipmac) String() string {
	return i.ip.String() + " " + i.mac.String()
}
/*
Package api represents all requests and responses suitable for conversation
with a remote driver.
*/
package api
import (
"net"
"github.com/docker/docker/daemon/libnetwork/discoverapi"
"github.com/docker/docker/daemon/libnetwork/driverapi"
)
// Response is the basic response structure used in all responses.
type Response struct {
	// Err is a non-empty error message when the remote operation failed.
	Err string
}

// GetError returns the error from the response, if any.
func (r *Response) GetError() string {
	return r.Err
}

// GetCapabilityResponse is the response of GetCapability request
type GetCapabilityResponse struct {
	Response
	// Scope is the driver's data scope, "local" or "global".
	Scope string
	// ConnectivityScope is the driver's connectivity scope, "local" or
	// "global"; when empty it defaults to Scope.
	ConnectivityScope string
	// GwAllocChecker is used by the driver to report that it will accept a
	// [GwAllocCheckerRequest] at "GwAllocCheck".
	GwAllocChecker bool
}
// AllocateNetworkRequest requests allocation of new network by manager
type AllocateNetworkRequest struct {
	// A network ID that remote plugins are expected to store for future
	// reference.
	NetworkID string
	// A free form map->object interface for communication of options.
	Options map[string]string
	// IPAMData contains the address pool information for this network
	IPv4Data, IPv6Data []driverapi.IPAMData
}

// AllocateNetworkResponse is the response to the AllocateNetworkRequest.
type AllocateNetworkResponse struct {
	Response
	// A free form plugin specific string->string object to be sent in
	// CreateNetworkRequest call in the libnetwork agents
	Options map[string]string
}

// FreeNetworkRequest is the request to free allocated network in the manager
type FreeNetworkRequest struct {
	// The ID of the network to be freed.
	NetworkID string
}

// FreeNetworkResponse is the response to a request for freeing a network.
type FreeNetworkResponse struct {
	Response
}

// GwAllocCheckerRequest is the body of a request sent to "GwAllocCheck", if the
// driver reported capability "GwAllocChecker". This request is sent before the
// [CreateNetworkRequest].
type GwAllocCheckerRequest struct {
	// Options has the same form as Options in [CreateNetworkRequest].
	Options map[string]interface{}
}

// GwAllocCheckerResponse is the response to a [GwAllocCheckerRequest].
type GwAllocCheckerResponse struct {
	Response
	// SkipIPv4, if true, tells Docker that when it creates a network with the
	// Options in the [GwAllocCheckerRequest] it should not reserve an IPv4
	// gateway address.
	SkipIPv4 bool
	// SkipIPv6, if true, tells Docker that when it creates a network with the
	// Options in the [GwAllocCheckerRequest] it should not reserve an IPv6
	// gateway address.
	SkipIPv6 bool
}
// CreateNetworkRequest requests a new network.
type CreateNetworkRequest struct {
	// A network ID that remote plugins are expected to store for future
	// reference.
	NetworkID string
	// A free form map->object interface for communication of options.
	Options map[string]interface{}
	// IPAMData contains the address pool information for this network
	IPv4Data, IPv6Data []driverapi.IPAMData
}

// CreateNetworkResponse is the response to the CreateNetworkRequest.
type CreateNetworkResponse struct {
	Response
}

// DeleteNetworkRequest is the request to delete an existing network.
type DeleteNetworkRequest struct {
	// The ID of the network to delete.
	NetworkID string
}

// DeleteNetworkResponse is the response to a request for deleting a network.
type DeleteNetworkResponse struct {
	Response
}

// CreateEndpointRequest is the request to create an endpoint within a network.
type CreateEndpointRequest struct {
	// Provided at create time, this will be the network id referenced.
	NetworkID string
	// The ID of the endpoint for later reference.
	EndpointID string
	Interface  *EndpointInterface
	Options    map[string]interface{}
}

// EndpointInterface represents an interface endpoint.
type EndpointInterface struct {
	// Address is an IPv4 address in CIDR notation.
	Address string
	// AddressIPv6 is an IPv6 address in CIDR notation.
	AddressIPv6 string
	// MacAddress is a MAC address in a form accepted by net.ParseMAC.
	MacAddress string
}

// CreateEndpointResponse is the response to the CreateEndpoint action.
type CreateEndpointResponse struct {
	Response
	// Interface carries any interface values assigned by the driver;
	// empty fields are ignored by the caller.
	Interface *EndpointInterface
}

// Interface is the representation of a linux interface.
type Interface struct {
	Address     *net.IPNet
	AddressIPv6 *net.IPNet
	MacAddress  net.HardwareAddr
}

// DeleteEndpointRequest describes the API for deleting an endpoint.
type DeleteEndpointRequest struct {
	NetworkID  string
	EndpointID string
}

// DeleteEndpointResponse is the response to the DeleteEndpoint action.
type DeleteEndpointResponse struct {
	Response
}

// EndpointInfoRequest retrieves information about the endpoint from the network driver.
type EndpointInfoRequest struct {
	NetworkID  string
	EndpointID string
}

// EndpointInfoResponse is the response to an EndpointInfoRequest.
type EndpointInfoResponse struct {
	Response
	// Value is free-form operational data about the endpoint.
	Value map[string]interface{}
}
// JoinRequest describes the API for joining an endpoint to a sandbox.
type JoinRequest struct {
	NetworkID  string
	EndpointID string
	// SandboxKey identifies the sandbox being joined.
	SandboxKey string
	Options    map[string]interface{}
}

// InterfaceName is the struct representation of a pair of devices with source
// and destination, for the purposes of putting an endpoint into a container.
type InterfaceName struct {
	SrcName   string
	DstName   string
	DstPrefix string
}

// StaticRoute is the plain JSON representation of a static route.
type StaticRoute struct {
	// Destination is the route destination in CIDR notation.
	Destination string
	RouteType   int
	// NextHop is the next-hop IP address, if any.
	NextHop string
}

// JoinResponse is the response to a JoinRequest.
type JoinResponse struct {
	Response
	InterfaceName *InterfaceName
	Gateway       string
	GatewayIPv6   string
	StaticRoutes  []StaticRoute
	// DisableGatewayService tells libnetwork not to provide gateway
	// services for this sandbox.
	DisableGatewayService bool
}

// LeaveRequest describes the API for detaching an endpoint from a sandbox.
type LeaveRequest struct {
	NetworkID  string
	EndpointID string
}

// LeaveResponse is the answer to LeaveRequest.
type LeaveResponse struct {
	Response
}

// ProgramExternalConnectivityRequest describes the API for programming the external connectivity for the given endpoint.
type ProgramExternalConnectivityRequest struct {
	NetworkID  string
	EndpointID string
	Options    map[string]interface{}
}

// ProgramExternalConnectivityResponse is the answer to ProgramExternalConnectivityRequest.
type ProgramExternalConnectivityResponse struct {
	Response
}

// RevokeExternalConnectivityRequest describes the API for revoking the external connectivity for the given endpoint.
type RevokeExternalConnectivityRequest struct {
	NetworkID  string
	EndpointID string
}

// RevokeExternalConnectivityResponse is the answer to RevokeExternalConnectivityRequest.
type RevokeExternalConnectivityResponse struct {
	Response
}

// DiscoveryNotification represents a discovery notification
type DiscoveryNotification struct {
	DiscoveryType discoverapi.DiscoveryType
	DiscoveryData interface{}
}

// DiscoveryResponse is used by libnetwork to log any plugin error processing the discovery notifications
type DiscoveryResponse struct {
	Response
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package remote
import (
"context"
"fmt"
"maps"
"net"
"sync"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/discoverapi"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/drivers/remote/api"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/options"
"github.com/docker/docker/daemon/libnetwork/scope"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/pkg/plugingetter"
"github.com/docker/docker/pkg/plugins"
"github.com/pkg/errors"
)
// remote driver must implement the discover-API.
var _ discoverapi.Discover = (*driver)(nil)

// driver proxies every driverapi call over a plugin client to an
// out-of-process network driver.
type driver struct {
	endpoint       *plugins.Client
	networkType    string
	gwAllocChecker bool // plugin advertised the "GwAllocCheck" capability
	// nwEndpoints tracks per-endpoint state for every endpoint that has
	// joined a sandbox (added in Join, removed in Leave), keyed by endpoint ID.
	nwEndpoints   map[string]*nwEndpoint
	nwEndpointsMu sync.Mutex
}

// State info for an endpoint.
type nwEndpoint struct {
	sbOptions  map[string]any // Sandbox (container) options, from Join.
	isGateway4 bool           // Whether ProgramExternalConnectivity reported that this ep is a gateway.
	isGateway6 bool
}

// maybeError is implemented by all api response types; it exposes the remote
// error string, if any, for [driver.call] to surface as an error.
type maybeError interface {
	GetError() string
}
// newDriver returns a remote driver proxy for the named network type,
// backed by the given plugin client.
func newDriver(name string, client *plugins.Client) *driver {
	d := &driver{
		endpoint:    client,
		networkType: name,
	}
	d.nwEndpoints = make(map[string]*nwEndpoint)
	return d
}
// Register makes sure a remote driver is registered with r when a network
// driver plugin is activated.
func Register(r driverapi.Registerer, pg plugingetter.PluginGetter) error {
	// activate negotiates capabilities with the plugin and registers the
	// resulting driver with r. Failures are logged, not returned, so one bad
	// plugin cannot block activation of the others.
	activate := func(name string, client *plugins.Client) {
		d := newDriver(name, client)
		c, err := d.getCapabilities()
		if err != nil {
			log.G(context.TODO()).Errorf("error getting capability for %s due to %v", name, err)
			return
		}
		if err := r.RegisterDriver(name, d, *c); err != nil {
			log.G(context.TODO()).Errorf("error registering driver for %s due to %v", name, err)
		}
	}
	if pg == nil {
		// Unit test code is unaware of a true PluginStore. So we fall back to v1 plugins.
		plugins.Handle(driverapi.NetworkPluginEndpointType, activate)
		return nil
	}
	// Register plugins that are already active before subscribing to new ones.
	for _, ap := range pg.GetAllManagedPluginsByCap(driverapi.NetworkPluginEndpointType) {
		client, err := getPluginClient(ap)
		if err != nil {
			return err
		}
		activate(ap.Name(), client)
	}
	pg.Handle(driverapi.NetworkPluginEndpointType, activate)
	return nil
}
// getPluginClient extracts a usable plugins.Client from a CompatPlugin,
// supporting both plugins that expose a v1 client directly and address-based
// plugins speaking the v1 HTTP protocol.
func getPluginClient(p plugingetter.CompatPlugin) (*plugins.Client, error) {
	switch pc := p.(type) {
	case plugingetter.PluginWithV1Client:
		return pc.Client(), nil
	case plugingetter.PluginAddr:
		if pc.Protocol() != plugins.ProtocolSchemeHTTPV1 {
			return nil, errors.Errorf("unsupported plugin protocol %s", pc.Protocol())
		}
		addr := pc.Addr()
		client, err := plugins.NewClientWithTimeout(addr.Network()+"://"+addr.String(), nil, pc.Timeout())
		if err != nil {
			return nil, errors.Wrap(err, "error creating plugin client")
		}
		return client, nil
	default:
		return nil, errors.Errorf("unknown plugin type %T", p)
	}
}
// getCapabilities fetches and validates the remote driver's capability,
// mapping it onto a driverapi.Capability. It also records whether the plugin
// supports the "GwAllocCheck" call.
func (d *driver) getCapabilities() (*driverapi.Capability, error) {
	var capResp api.GetCapabilityResponse
	if err := d.call("GetCapabilities", nil, &capResp); err != nil {
		return nil, err
	}
	c := &driverapi.Capability{}
	switch capResp.Scope {
	case scope.Global, scope.Local:
		c.DataScope = capResp.Scope
	default:
		return nil, fmt.Errorf("invalid capability: expecting 'local' or 'global', got %s", capResp.Scope)
	}
	switch capResp.ConnectivityScope {
	case scope.Global, scope.Local:
		c.ConnectivityScope = capResp.ConnectivityScope
	case "":
		// An unset connectivity scope inherits the data scope.
		c.ConnectivityScope = c.DataScope
	default:
		// Fixed: report the value that was actually rejected; the original
		// code mistakenly formatted capResp.Scope here.
		return nil, fmt.Errorf("invalid capability: expecting 'local' or 'global', got %s", capResp.ConnectivityScope)
	}
	d.gwAllocChecker = capResp.GwAllocChecker
	return c, nil
}
// Config is not implemented for remote drivers, since it is assumed
// to be supplied to the remote process out-of-band (e.g., as command
// line arguments). It always returns driverapi.ErrNotImplemented.
func (d *driver) Config(option map[string]interface{}) error {
	return &driverapi.ErrNotImplemented{}
}
// call invokes methodName on the remote driver with arg as the request body,
// decoding the reply into retVal. Both a transport failure and a non-empty
// error string in the response are surfaced as errors.
func (d *driver) call(methodName string, arg interface{}, retVal maybeError) error {
	if err := d.endpoint.Call(driverapi.NetworkPluginEndpointType+"."+methodName, arg, retVal); err != nil {
		return err
	}
	if msg := retVal.GetError(); msg != "" {
		return fmt.Errorf("remote: %s", msg)
	}
	return nil
}
// NetworkAllocate asks the remote driver to allocate resources for a new
// network and returns the driver-specific options from the response.
func (d *driver) NetworkAllocate(id string, options map[string]string, ipV4Data, ipV6Data []driverapi.IPAMData) (map[string]string, error) {
	create := &api.AllocateNetworkRequest{
		NetworkID: id,
		Options:   options,
		IPv4Data:  ipV4Data,
		IPv6Data:  ipV6Data,
	}
	retVal := api.AllocateNetworkResponse{}
	err := d.call("AllocateNetwork", create, &retVal)
	return retVal.Options, err
}

// NetworkFree asks the remote driver to release an allocated network.
func (d *driver) NetworkFree(id string) error {
	fr := &api.FreeNetworkRequest{NetworkID: id}
	return d.call("FreeNetwork", fr, &api.FreeNetworkResponse{})
}

// EventNotify is a no-op: cluster table events are not forwarded to the
// remote driver.
func (d *driver) EventNotify(etype driverapi.EventType, nid, tableName, key string, value []byte) {
}

// DecodeTableEntry is a no-op for remote drivers.
func (d *driver) DecodeTableEntry(tablename string, key string, value []byte) (string, map[string]string) {
	return "", nil
}

// CreateNetwork forwards a CreateNetwork request to the remote driver.
func (d *driver) CreateNetwork(ctx context.Context, id string, options map[string]interface{}, nInfo driverapi.NetworkInfo, ipV4Data, ipV6Data []driverapi.IPAMData) error {
	create := &api.CreateNetworkRequest{
		NetworkID: id,
		Options:   options,
		IPv4Data:  ipV4Data,
		IPv6Data:  ipV6Data,
	}
	return d.call("CreateNetwork", create, &api.CreateNetworkResponse{})
}

// GetSkipGwAlloc asks the remote driver whether gateway address reservation
// should be skipped for a network created with opts. It returns false/false
// without a remote call unless the plugin advertised the GwAllocChecker
// capability.
func (d *driver) GetSkipGwAlloc(opts options.Generic) (ipv4, ipv6 bool, _ error) {
	if !d.gwAllocChecker {
		return false, false, nil
	}
	resp := &api.GwAllocCheckerResponse{}
	if err := d.call("GwAllocCheck", &api.GwAllocCheckerRequest{Options: opts}, resp); err != nil {
		return false, false, err
	}
	return resp.SkipIPv4, resp.SkipIPv6, nil
}

// DeleteNetwork forwards a DeleteNetwork request to the remote driver.
func (d *driver) DeleteNetwork(nid string) error {
	return d.call("DeleteNetwork", &api.DeleteNetworkRequest{NetworkID: nid}, &api.DeleteNetworkResponse{})
}
// CreateEndpoint forwards a CreateEndpoint request to the remote driver,
// sending the interface values libnetwork has already assigned. Interface
// values returned by the driver are written back into ifInfo. If any step
// after the successful remote call fails, the endpoint is deleted again to
// roll back.
func (d *driver) CreateEndpoint(_ context.Context, nid, eid string, ifInfo driverapi.InterfaceInfo, epOptions map[string]interface{}) (retErr error) {
	if ifInfo == nil {
		return errors.New("must not be called with nil InterfaceInfo")
	}
	// Forward whatever addresses/MAC are already assigned to the interface.
	reqIface := &api.EndpointInterface{}
	if ifInfo.Address() != nil {
		reqIface.Address = ifInfo.Address().String()
	}
	if ifInfo.AddressIPv6() != nil {
		reqIface.AddressIPv6 = ifInfo.AddressIPv6().String()
	}
	if ifInfo.MacAddress() != nil {
		reqIface.MacAddress = ifInfo.MacAddress().String()
	}
	create := &api.CreateEndpointRequest{
		NetworkID:  nid,
		EndpointID: eid,
		Interface:  reqIface,
		Options:    epOptions,
	}
	var res api.CreateEndpointResponse
	if err := d.call("CreateEndpoint", create, &res); err != nil {
		return err
	}
	// The endpoint now exists on the remote side; undo it if any later step fails.
	defer func() {
		if retErr != nil {
			if err := d.DeleteEndpoint(nid, eid); err != nil {
				retErr = fmt.Errorf("%w; failed to roll back: %w", err, retErr)
			} else {
				retErr = fmt.Errorf("%w; rolled back", retErr)
			}
		}
	}()
	inIface, err := parseInterface(res)
	if err != nil {
		return err
	}
	if inIface == nil {
		// Remote driver did not set any field
		return nil
	}
	// Apply driver-returned values. NOTE(review): the setters appear to
	// reject overriding values libnetwork already assigned (hence the
	// "driver modified" wording) — confirm against driverapi.InterfaceInfo.
	if inIface.MacAddress != nil {
		if err := ifInfo.SetMacAddress(inIface.MacAddress); err != nil {
			return fmt.Errorf("driver modified interface MAC address: %v", err)
		}
	}
	if inIface.Address != nil {
		if err := ifInfo.SetIPAddress(inIface.Address); err != nil {
			return fmt.Errorf("driver modified interface address: %v", err)
		}
	}
	if inIface.AddressIPv6 != nil {
		if err := ifInfo.SetIPAddress(inIface.AddressIPv6); err != nil {
			return fmt.Errorf("driver modified interface address: %v", err)
		}
	}
	return nil
}
// DeleteEndpoint forwards a DeleteEndpoint request to the remote driver.
func (d *driver) DeleteEndpoint(nid, eid string) error {
	deleteRequest := &api.DeleteEndpointRequest{
		NetworkID:  nid,
		EndpointID: eid,
	}
	return d.call("DeleteEndpoint", deleteRequest, &api.DeleteEndpointResponse{})
}

// EndpointOperInfo queries the remote driver for operational info about an
// endpoint and returns the free-form key/value data from the response.
func (d *driver) EndpointOperInfo(nid, eid string) (map[string]interface{}, error) {
	info := &api.EndpointInfoRequest{
		NetworkID:  nid,
		EndpointID: eid,
	}
	var res api.EndpointInfoResponse
	if err := d.call("EndpointOperInfo", info, &res); err != nil {
		return nil, err
	}
	return res.Value, nil
}
// Join method is invoked when a Sandbox is attached to an endpoint.
// The remote driver's response may carry an interface name, gateways and
// static routes, all of which are programmed into jinfo. On any failure after
// the remote Join succeeded, Leave is called to roll back. On success the
// endpoint and its sandbox options are tracked in d.nwEndpoints for later use
// by ProgramExternalConnectivity.
func (d *driver) Join(_ context.Context, nid, eid string, sboxKey string, jinfo driverapi.JoinInfo, _, options map[string]interface{}) (retErr error) {
	join := &api.JoinRequest{
		NetworkID:  nid,
		EndpointID: eid,
		SandboxKey: sboxKey,
		Options:    options,
	}
	var (
		res api.JoinResponse
		err error
	)
	if err = d.call("Join", join, &res); err != nil {
		return err
	}
	// The sandbox has joined on the remote side; undo it if any later step fails.
	defer func() {
		if retErr != nil {
			if err := d.Leave(nid, eid); err != nil {
				retErr = fmt.Errorf("%w; failed to roll back: %w", err, retErr)
			} else {
				retErr = fmt.Errorf("%w; rolled back", retErr)
			}
		}
	}()
	ifaceName := res.InterfaceName
	if iface := jinfo.InterfaceName(); iface != nil && ifaceName != nil {
		// Only SrcName and DstPrefix from the response are used; DstName is
		// ignored here.
		if err := iface.SetNames(ifaceName.SrcName, ifaceName.DstPrefix, ""); err != nil {
			return fmt.Errorf("failed to set interface name: %s", err)
		}
	}
	var addr net.IP
	if res.Gateway != "" {
		if addr = net.ParseIP(res.Gateway); addr == nil {
			return fmt.Errorf(`unable to parse Gateway "%s"`, res.Gateway)
		}
		if jinfo.SetGateway(addr) != nil {
			return fmt.Errorf("failed to set gateway: %v", addr)
		}
	}
	if res.GatewayIPv6 != "" {
		if addr = net.ParseIP(res.GatewayIPv6); addr == nil {
			return fmt.Errorf(`unable to parse GatewayIPv6 "%s"`, res.GatewayIPv6)
		}
		if jinfo.SetGatewayIPv6(addr) != nil {
			return fmt.Errorf("failed to set gateway IPv6: %v", addr)
		}
	}
	if len(res.StaticRoutes) > 0 {
		routes, err := parseStaticRoutes(res)
		if err != nil {
			return err
		}
		for _, route := range routes {
			if jinfo.AddStaticRoute(route.Destination, route.RouteType, route.NextHop) != nil {
				return fmt.Errorf("failed to set static route: %v", route)
			}
		}
	}
	if res.DisableGatewayService {
		jinfo.DisableGatewayService()
	}
	// Record the joined endpoint together with its sandbox options.
	d.nwEndpointsMu.Lock()
	defer d.nwEndpointsMu.Unlock()
	d.nwEndpoints[eid] = &nwEndpoint{sbOptions: options}
	return nil
}
// Leave method is invoked when a Sandbox detaches from an endpoint.
// It notifies the remote driver and, on success, drops the endpoint's
// tracked state.
func (d *driver) Leave(nid, eid string) error {
	req := &api.LeaveRequest{
		NetworkID:  nid,
		EndpointID: eid,
	}
	if err := d.call("Leave", req, &api.LeaveResponse{}); err != nil {
		return err
	}
	d.nwEndpointsMu.Lock()
	delete(d.nwEndpoints, eid)
	d.nwEndpointsMu.Unlock()
	return nil
}
// ProgramExternalConnectivity is invoked to program the rules to allow external connectivity for the endpoint.
// gw4Id/gw6Id are the IDs of the endpoints currently acting as the sandbox's
// IPv4/IPv6 gateways; this endpoint's gateway role is derived by comparing
// them with eid. If the role has not changed, nothing is sent to the driver.
func (d *driver) ProgramExternalConnectivity(_ context.Context, nid, eid string, gw4Id, gw6Id string) error {
	d.nwEndpointsMu.Lock()
	ep, ok := d.nwEndpoints[eid]
	d.nwEndpointsMu.Unlock()
	if !ok {
		return fmt.Errorf("remote network driver: endpoint %s not found", eid)
	}
	isGw4, isGw6 := gw4Id == eid, gw6Id == eid
	if ep.isGateway4 == isGw4 && ep.isGateway6 == isGw6 {
		// No change in gateway role; nothing to (re)program.
		return nil
	}
	if !isGw4 && !isGw6 {
		// This endpoint is no longer a gateway for either address family.
		return d.revokeExternalConnectivity(nid, eid)
	}
	ep.isGateway4, ep.isGateway6 = isGw4, isGw6
	options := ep.sbOptions
	if !isGw6 && gw6Id != "" {
		// If there is an IPv6 gateway, but it's not eid, set NoProxy6To4. This label was
		// used to tell the bridge driver not to try to use the userland proxy for dual
		// stack port mappings between host IPv6 and container IPv4 (because a different
		// endpoint may be dealing with IPv6 host addresses). It was undocumented for the
		// remote driver, marked as being for internal use and subject to later removal.
		// But, preserve it here for now as there's no other way for a remote driver to
		// know it shouldn't try to deal with IPv6 in this case.
		options = maps.Clone(ep.sbOptions)
		options[netlabel.NoProxy6To4] = true
	}
	data := &api.ProgramExternalConnectivityRequest{
		NetworkID:  nid,
		EndpointID: eid,
		Options:    options,
	}
	err := d.call("ProgramExternalConnectivity", data, &api.ProgramExternalConnectivityResponse{})
	if err != nil && plugins.IsNotFound(err) {
		// It is not mandatory yet to support this method
		return nil
	}
	return err
}
// revokeExternalConnectivity method is invoked to remove any external
// connectivity programming related to the endpoint. The endpoint's gateway
// flags are cleared before notifying the remote driver.
func (d *driver) revokeExternalConnectivity(nid, eid string) error {
	// Fixed: the map lookup must be performed under the lock. The original
	// code called Unlock without a matching Lock, which both left the map
	// read unguarded and would panic ("unlock of unlocked mutex") at runtime.
	d.nwEndpointsMu.Lock()
	ep, ok := d.nwEndpoints[eid]
	d.nwEndpointsMu.Unlock()
	if !ok {
		return fmt.Errorf("remote network driver: endpoint %s not found", eid)
	}
	data := &api.RevokeExternalConnectivityRequest{
		NetworkID:  nid,
		EndpointID: eid,
	}
	ep.isGateway4, ep.isGateway6 = false, false
	err := d.call("RevokeExternalConnectivity", data, &api.RevokeExternalConnectivityResponse{})
	if err != nil && plugins.IsNotFound(err) {
		// It is not mandatory yet to support this method
		return nil
	}
	return err
}
// Type returns the network type this driver proxy was created for.
func (d *driver) Type() string {
	return d.networkType
}

// IsBuiltIn returns false: remote drivers are provided by plugins.
func (d *driver) IsBuiltIn() bool {
	return false
}

// DiscoverNew is a notification for a new discovery event, such as a new node joining a cluster
// Only node-discovery events are forwarded to the remote driver.
func (d *driver) DiscoverNew(dType discoverapi.DiscoveryType, data interface{}) error {
	if dType != discoverapi.NodeDiscovery {
		return nil
	}
	notif := &api.DiscoveryNotification{
		DiscoveryType: dType,
		DiscoveryData: data,
	}
	return d.call("DiscoverNew", notif, &api.DiscoveryResponse{})
}

// DiscoverDelete is a notification for a discovery delete event, such as a node leaving a cluster
// Only node-discovery events are forwarded to the remote driver.
func (d *driver) DiscoverDelete(dType discoverapi.DiscoveryType, data interface{}) error {
	if dType != discoverapi.NodeDiscovery {
		return nil
	}
	notif := &api.DiscoveryNotification{
		DiscoveryType: dType,
		DiscoveryData: data,
	}
	return d.call("DiscoverDelete", notif, &api.DiscoveryResponse{})
}
// parseStaticRoutes converts the string-typed static routes in a JoinResponse
// into typed StaticRoute values, validating each CIDR destination and
// next-hop IP along the way.
func parseStaticRoutes(r api.JoinResponse) ([]*types.StaticRoute, error) {
	routes := make([]*types.StaticRoute, 0, len(r.StaticRoutes))
	for _, in := range r.StaticRoutes {
		out := &types.StaticRoute{RouteType: in.RouteType}
		if in.Destination != "" {
			dest, err := types.ParseCIDR(in.Destination)
			if err != nil {
				return nil, err
			}
			out.Destination = dest
		}
		if in.NextHop != "" {
			if out.NextHop = net.ParseIP(in.NextHop); out.NextHop == nil {
				return nil, fmt.Errorf("failed to parse nexthop IP %s", in.NextHop)
			}
		}
		routes = append(routes, out)
	}
	return routes, nil
}
// parseInterface validates all the parameters of an Interface and returns them.
// A nil Interface in the response yields (nil, nil); empty string fields are
// left unset in the result.
func parseInterface(r api.CreateEndpointResponse) (*api.Interface, error) {
	in := r.Interface
	if in == nil {
		return nil, nil
	}
	out := &api.Interface{}
	var err error
	if in.Address != "" {
		if out.Address, err = types.ParseCIDR(in.Address); err != nil {
			return nil, err
		}
	}
	if in.AddressIPv6 != "" {
		if out.AddressIPv6, err = types.ParseCIDR(in.AddressIPv6); err != nil {
			return nil, err
		}
	}
	if in.MacAddress != "" {
		if out.MacAddress, err = net.ParseMAC(in.MacAddress); err != nil {
			return nil, err
		}
	}
	return out, nil
}
package libnetwork
import (
"context"
"fmt"
"os"
"github.com/docker/docker/daemon/libnetwork/config"
"github.com/docker/docker/daemon/libnetwork/datastore"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge"
"github.com/docker/docker/daemon/libnetwork/drivers/host"
"github.com/docker/docker/daemon/libnetwork/drivers/ipvlan"
"github.com/docker/docker/daemon/libnetwork/drivers/macvlan"
"github.com/docker/docker/daemon/libnetwork/drivers/null"
"github.com/docker/docker/daemon/libnetwork/drivers/overlay"
"github.com/docker/docker/daemon/libnetwork/drvregistry"
"github.com/docker/docker/daemon/libnetwork/internal/rlkclient"
"github.com/docker/docker/daemon/libnetwork/portmapper"
"github.com/docker/docker/daemon/libnetwork/portmappers/nat"
"github.com/docker/docker/daemon/libnetwork/portmappers/routed"
"github.com/docker/docker/daemon/libnetwork/types"
)
// registerNetworkDrivers registers all built-in network drivers with r.
// driverConfig is queried once per driver type for that driver's
// configuration; a failure registering any driver aborts the whole sequence.
func registerNetworkDrivers(r driverapi.Registerer, store *datastore.Store, pms *drvregistry.PortMappers, driverConfig func(string) map[string]interface{}) error {
	for _, nr := range []struct {
		ntype    string
		register func(driverapi.Registerer, *datastore.Store, map[string]interface{}) error
	}{
		// The bridge driver additionally needs the port-mapper registry.
		{ntype: bridge.NetworkType, register: func(r driverapi.Registerer, store *datastore.Store, cfg map[string]interface{}) error {
			return bridge.Register(r, store, pms, cfg)
		}},
		{ntype: host.NetworkType, register: func(r driverapi.Registerer, _ *datastore.Store, _ map[string]interface{}) error {
			return host.Register(r)
		}},
		{ntype: ipvlan.NetworkType, register: ipvlan.Register},
		{ntype: macvlan.NetworkType, register: macvlan.Register},
		{ntype: null.NetworkType, register: func(r driverapi.Registerer, _ *datastore.Store, _ map[string]interface{}) error {
			return null.Register(r)
		}},
		{ntype: overlay.NetworkType, register: func(r driverapi.Registerer, _ *datastore.Store, config map[string]interface{}) error {
			return overlay.Register(r, config)
		}},
	} {
		if err := nr.register(r, store, driverConfig(nr.ntype)); err != nil {
			return fmt.Errorf("failed to register %q driver: %w", nr.ntype, err)
		}
	}
	return nil
}
// registerPortMappers registers the built-in "nat" and "routed" port mappers
// with r. In rootless mode a port driver client is created first and handed
// to the nat mapper.
func registerPortMappers(ctx context.Context, r *drvregistry.PortMappers, cfg *config.Config) error {
	var pdc *rlkclient.PortDriverClient
	if cfg.Rootless {
		var err error
		pdc, err = rlkclient.NewPortDriverClient(ctx)
		if err != nil {
			return fmt.Errorf("failed to create port driver client: %w", err)
		}
	}
	if err := nat.Register(r, nat.Config{
		RlkClient: pdc,
		// StartProxy launches the configured userland proxy binary for a
		// port binding.
		StartProxy: func(pb types.PortBinding, file *os.File) (func() error, error) {
			return portmapper.StartProxy(pb, cfg.UserlandProxyPath, file)
		},
		// The proxy is only enabled when switched on AND a binary path is configured.
		EnableProxy: cfg.EnableUserlandProxy && cfg.UserlandProxyPath != "",
	}); err != nil {
		return fmt.Errorf("registering nat portmapper: %w", err)
	}
	if err := routed.Register(r); err != nil {
		return fmt.Errorf("registering routed portmapper: %w", err)
	}
	return nil
}
package drvregistry
import (
"errors"
"strings"
"sync"
"github.com/docker/docker/daemon/libnetwork/ipamapi"
"github.com/docker/docker/daemon/libnetwork/types"
)
// ipamDriver pairs a registered IPAM driver with the capabilities it was
// registered with.
type ipamDriver struct {
	driver     ipamapi.Ipam
	capability *ipamapi.Capability
}

// IPAMs is a registry of IPAM drivers. The zero value is an empty IPAM driver
// registry, ready to use.
type IPAMs struct {
	mu sync.Mutex
	// drivers is keyed by driver name and lazily allocated on first registration.
	drivers map[string]ipamDriver
}

var _ ipamapi.Registerer = (*IPAMs)(nil)
// IPAM returns the actual IPAM driver instance and its capability which registered with the passed name.
// Both results are zero values when no driver is registered under name.
func (ir *IPAMs) IPAM(name string) (ipamapi.Ipam, *ipamapi.Capability) {
	ir.mu.Lock()
	entry := ir.drivers[name]
	ir.mu.Unlock()
	return entry.driver, entry.capability
}
// RegisterIpamDriverWithCapabilities registers the IPAM driver discovered with specified capabilities.
// Re-registering an existing name is permitted unless the existing
// registration is a built-in driver.
func (ir *IPAMs) RegisterIpamDriverWithCapabilities(name string, driver ipamapi.Ipam, caps *ipamapi.Capability) error {
	if strings.TrimSpace(name) == "" {
		return errors.New("ipam driver name string cannot be empty")
	}
	ir.mu.Lock()
	defer ir.mu.Unlock()
	if existing, ok := ir.drivers[name]; ok && existing.driver.IsBuiltIn() {
		return types.ForbiddenErrorf("ipam driver %q already registered", name)
	}
	if ir.drivers == nil {
		ir.drivers = make(map[string]ipamDriver)
	}
	ir.drivers[name] = ipamDriver{driver: driver, capability: caps}
	return nil
}
// RegisterIpamDriver registers the IPAM driver discovered with default capabilities.
// It is shorthand for RegisterIpamDriverWithCapabilities with a zero Capability.
func (ir *IPAMs) RegisterIpamDriver(name string, driver ipamapi.Ipam) error {
	return ir.RegisterIpamDriverWithCapabilities(name, driver, &ipamapi.Capability{})
}
// IPAMWalkFunc defines the IPAM driver table walker function signature.
type IPAMWalkFunc func(name string, driver ipamapi.Ipam, capability *ipamapi.Capability) bool

// WalkIPAMs walks the IPAM drivers registered in the registry and invokes the passed walk function and each one of them.
// The walk stops early when ifn returns true. The table is snapshotted under
// the lock so ifn itself runs without holding it.
func (ir *IPAMs) WalkIPAMs(ifn IPAMWalkFunc) {
	type entry struct {
		name string
		data ipamDriver
	}
	ir.mu.Lock()
	snapshot := make([]entry, 0, len(ir.drivers))
	for name, data := range ir.drivers {
		snapshot = append(snapshot, entry{name: name, data: data})
	}
	ir.mu.Unlock()
	for _, e := range snapshot {
		if ifn(e.name, e.data.driver, e.data.capability) {
			return
		}
	}
}
package drvregistry
import (
"errors"
"strings"
"sync"
"github.com/docker/docker/daemon/libnetwork/driverapi"
)
// DriverWalkFunc defines the network driver table walker function signature.
type DriverWalkFunc func(name string, driver driverapi.Driver, capability driverapi.Capability) bool

// driverData pairs a registered network driver with the capability it was
// registered with.
type driverData struct {
	driver     driverapi.Driver
	capability driverapi.Capability
}

// Networks is a registry of network drivers. The zero value is an empty network
// driver registry, ready to use.
type Networks struct {
	// Notify is called whenever a network driver is registered.
	Notify driverapi.Registerer
	mu     sync.Mutex
	// drivers is keyed by network type and lazily allocated on first registration.
	drivers map[string]driverData
}

var _ driverapi.Registerer = (*Networks)(nil)
// WalkDrivers walks the network drivers registered in the registry and invokes the passed walk function and each one of them.
// The walk stops early when dfn returns true. The table is snapshotted under
// the lock so dfn itself runs without holding it.
func (nr *Networks) WalkDrivers(dfn DriverWalkFunc) {
	type entry struct {
		name string
		data driverData
	}
	nr.mu.Lock()
	snapshot := make([]entry, 0, len(nr.drivers))
	for name, data := range nr.drivers {
		snapshot = append(snapshot, entry{name: name, data: data})
	}
	nr.mu.Unlock()
	for _, e := range snapshot {
		if dfn(e.name, e.data.driver, e.data.capability) {
			return
		}
	}
}
// Driver returns the network driver instance registered under name, and its capability.
// Both results are zero values when no driver is registered under name.
func (nr *Networks) Driver(name string) (driverapi.Driver, driverapi.Capability) {
	nr.mu.Lock()
	entry := nr.drivers[name]
	nr.mu.Unlock()
	return entry.driver, entry.capability
}
// RegisterDriver registers the network driver with nr.
// Re-registering an existing network type is permitted unless the existing
// registration is a built-in driver. The Notify callback, if set, is invoked
// before the driver is stored; if it fails, the driver is not registered.
func (nr *Networks) RegisterDriver(ntype string, driver driverapi.Driver, capability driverapi.Capability) error {
	if strings.TrimSpace(ntype) == "" {
		return errors.New("network type string cannot be empty")
	}
	nr.mu.Lock()
	dd, ok := nr.drivers[ntype]
	nr.mu.Unlock()
	if ok && dd.driver.IsBuiltIn() {
		return driverapi.ErrActiveRegistration(ntype)
	}
	// NOTE(review): the lock is deliberately not held across the Notify
	// call — presumably so the callback may re-enter this registry; confirm
	// before changing the locking here.
	if nr.Notify != nil {
		if err := nr.Notify.RegisterDriver(ntype, driver, capability); err != nil {
			return err
		}
	}
	nr.mu.Lock()
	defer nr.mu.Unlock()
	if nr.drivers == nil {
		nr.drivers = make(map[string]driverData)
	}
	nr.drivers[ntype] = driverData{driver: driver, capability: capability}
	return nil
}
package drvregistry
import (
"errors"
"fmt"
"strings"
"github.com/docker/docker/daemon/libnetwork/portmapperapi"
)
// PortMappers is a registry of port-mapper drivers. The zero value is an
// empty registry, ready to use.
type PortMappers struct {
	// drivers is keyed by portmapper name and lazily allocated on first Register.
	drivers map[string]portmapperapi.PortMapper
}
// Register a portmapper with the registry. The name must be non-blank and
// not already registered.
func (r *PortMappers) Register(name string, pm portmapperapi.PortMapper) error {
	if strings.TrimSpace(name) == "" {
		return errors.New("portmapper name cannot be empty")
	}
	if _, ok := r.drivers[name]; ok {
		// Include the conflicting name in the error, matching the style of
		// the IPAM and network driver registries.
		return fmt.Errorf("portmapper %q already registered", name)
	}
	if r.drivers == nil {
		r.drivers = make(map[string]portmapperapi.PortMapper)
	}
	r.drivers[name] = pm
	return nil
}
// Get retrieves a portmapper by name from the registry.
// An error is returned when no portmapper is registered under name.
func (r *PortMappers) Get(name string) (portmapperapi.PortMapper, error) {
	if pm, ok := r.drivers[name]; ok {
		return pm, nil
	}
	return nil, fmt.Errorf("portmapper %s not found", name)
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package libnetwork
import (
"context"
"encoding/json"
"errors"
"fmt"
"net"
"net/netip"
"strings"
"sync"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/datastore"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/ipamapi"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/options"
"github.com/docker/docker/daemon/libnetwork/scope"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/internal/sliceutil"
"github.com/docker/docker/pkg/stringid"
"go.opentelemetry.io/otel"
)
// ByNetworkType sorts a [Endpoint] slice based on the network-type
// they're attached to. It implements [sort.Interface] and can be used
// with [sort.Stable] or [sort.Sort]. It is used by [Sandbox.ResolveName]
// when resolving names in swarm mode. In swarm mode, services with exposed
// ports are connected to user overlay network, ingress network, and local
// ("docker_gwbridge") networks. Name resolution should prioritize returning
// the VIP/IPs on user overlay network over ingress and local networks.
//
// ByNetworkType re-orders the endpoints based on the network-type they
// are attached to:
//
//  1. dynamic networks (user overlay networks)
//  2. ingress network(s)
//  3. local networks ("docker_gwbridge")
type ByNetworkType []*Endpoint

func (ep ByNetworkType) Len() int      { return len(ep) }
func (ep ByNetworkType) Swap(i, j int) { ep[i], ep[j] = ep[j], ep[i] }

// Less orders endpoints by the priority class of their attached network
// (see getNetworkType).
func (ep ByNetworkType) Less(i, j int) bool {
	return getNetworkType(ep[i].getNetwork()) < getNetworkType(ep[j].getNetwork())
}
// Define the order in which resolution should happen if an endpoint is
// attached to multiple network-types. It is used by [ByNetworkType].
const (
	typeDynamic = iota
	typeIngress
	typeLocal
)

// getNetworkType maps a network to its resolution-priority class. The
// ingress check comes first, so a network flagged both ingress and dynamic
// is classified as ingress.
func getNetworkType(nw *Network) int {
	switch {
	case nw.ingress:
		return typeIngress
	case nw.dynamic:
		return typeDynamic
	default:
		return typeLocal
	}
}
// EndpointOption is an option setter function type used to pass various options to Network
// and Endpoint interfaces methods. The various setter functions of type EndpointOption are
// provided by libnetwork; they look like <Create|Join|Leave>Option[...](...)
type EndpointOption func(ep *Endpoint)
// Endpoint represents a logical connection between a network and a sandbox.
type Endpoint struct {
	name      string
	id        string
	network   *Network
	iface     *EndpointInterface
	joinInfo  *endpointJoinInfo
	sandboxID string
	exposedPorts []types.TransportPort
	// dnsNames holds all the non-fully qualified DNS names associated to this endpoint. Order matters: first entry
	// will be used for the PTR records associated to the endpoint's IPv4 and IPv6 addresses.
	dnsNames          []string
	disableResolution bool
	disableIPv6       bool
	// generic carries option key/value data (netlabel.* keys such as
	// PortMap, ExposedPorts, DNSServers, EndpointSysctls) passed through
	// to the network driver.
	generic       map[string]any
	prefAddress   net.IP
	prefAddressV6 net.IP
	ipamOptions   map[string]string
	aliases       map[string]string
	// Swarm-mode service binding state (see CreateOptionService).
	svcID        string
	svcName      string
	virtualIP    net.IP
	svcAliases   []string
	ingressPorts []*PortConfig
	// Datastore bookkeeping (see Index/SetIndex/Exists).
	dbIndex        uint64
	dbExists       bool
	serviceEnabled bool
	loadBalancer   bool
	// mu guards the mutable fields above; see per-method locking.
	mu sync.Mutex
}
// MarshalJSON encodes the endpoint's persistent state as a JSON object.
// The inverse operation is UnmarshalJSON.
func (ep *Endpoint) MarshalJSON() ([]byte, error) {
	ep.mu.Lock()
	defer ep.mu.Unlock()

	epMap := map[string]any{
		"name":              ep.name,
		"id":                ep.id,
		"ep_iface":          ep.iface,
		"joinInfo":          ep.joinInfo,
		"exposed_ports":     ep.exposedPorts,
		"sandbox":           ep.sandboxID,
		"dnsNames":          ep.dnsNames,
		"disableResolution": ep.disableResolution,
		"disableIPv6":       ep.disableIPv6,
		"svcName":           ep.svcName,
		"svcID":             ep.svcID,
		"virtualIP":         ep.virtualIP.String(),
		"ingressPorts":      ep.ingressPorts,
		"svcAliases":        ep.svcAliases,
		"loadBalancer":      ep.loadBalancer,
	}
	// "generic" is only emitted when the endpoint carries generic options.
	if ep.generic != nil {
		epMap["generic"] = ep.generic
	}
	return json.Marshal(epMap)
}
// UnmarshalJSON restores endpoint state from the JSON object produced by
// MarshalJSON. It also migrates on-disk state written by pre-v25 daemons
// (which used "anonymous"/"myAliases" instead of "dnsNames"; see the
// migration block at the end).
func (ep *Endpoint) UnmarshalJSON(b []byte) (err error) {
	ep.mu.Lock()
	defer ep.mu.Unlock()

	var epMap map[string]any
	if err := json.Unmarshal(b, &epMap); err != nil {
		return err
	}
	ep.name = epMap["name"].(string)
	ep.id = epMap["id"].(string)

	// TODO(cpuguy83): So yeah, this isn't checking any errors anywhere.
	// Seems like we should be checking errors even because of memory related issues that can arise.
	// Alas it seems like given the nature of this data we could introduce problems if we start checking these errors.
	//
	// If anyone ever comes here and figures out one way or another if we can/should be checking these errors and it turns out we can't... then please document *why*

	// Nested objects decoded into epMap as generic values; round-trip each
	// through Marshal/Unmarshal to decode into its concrete type.
	ib, _ := json.Marshal(epMap["ep_iface"]) //nolint:errchkjson // FIXME: handle json (Un)Marshal errors (see above)
	_ = json.Unmarshal(ib, &ep.iface)        //nolint:errcheck

	jb, _ := json.Marshal(epMap["joinInfo"]) //nolint:errchkjson // FIXME: handle json (Un)Marshal errors (see above)
	_ = json.Unmarshal(jb, &ep.joinInfo)     //nolint:errcheck

	tb, _ := json.Marshal(epMap["exposed_ports"]) //nolint:errchkjson // FIXME: handle json (Un)Marshal errors (see above)
	var tPorts []types.TransportPort
	_ = json.Unmarshal(tb, &tPorts) //nolint:errcheck
	ep.exposedPorts = tPorts

	cb, _ := json.Marshal(epMap["sandbox"]) //nolint:errchkjson // FIXME: handle json (Un)Marshal errors (see above)
	_ = json.Unmarshal(cb, &ep.sandboxID)   //nolint:errcheck

	if v, ok := epMap["generic"]; ok {
		ep.generic = v.(map[string]any)

		// Entries inside "generic" decode as []any of map[string]any;
		// convert the known netlabel entries back to their typed slices.
		if opt, ok := ep.generic[netlabel.PortMap]; ok {
			pblist := []types.PortBinding{}
			for i := 0; i < len(opt.([]any)); i++ {
				pb := types.PortBinding{}
				tmp := opt.([]any)[i].(map[string]any)
				bytes, err := json.Marshal(tmp)
				if err != nil {
					log.G(context.TODO()).Error(err)
					break
				}
				err = json.Unmarshal(bytes, &pb)
				if err != nil {
					log.G(context.TODO()).Error(err)
					break
				}
				pblist = append(pblist, pb)
			}
			ep.generic[netlabel.PortMap] = pblist
		}
		if opt, ok := ep.generic[netlabel.ExposedPorts]; ok {
			tplist := []types.TransportPort{}
			for i := 0; i < len(opt.([]any)); i++ {
				tp := types.TransportPort{}
				tmp := opt.([]any)[i].(map[string]any)
				bytes, err := json.Marshal(tmp)
				if err != nil {
					log.G(context.TODO()).Error(err)
					break
				}
				err = json.Unmarshal(bytes, &tp)
				if err != nil {
					log.G(context.TODO()).Error(err)
					break
				}
				tplist = append(tplist, tp)
			}
			ep.generic[netlabel.ExposedPorts] = tplist
		}
	}

	// "anonymous" is a legacy (pre-v25) field; it only matters for the
	// dnsNames migration at the bottom of this function.
	var anonymous bool
	if v, ok := epMap["anonymous"]; ok {
		anonymous = v.(bool)
	}
	if v, ok := epMap["disableResolution"]; ok {
		ep.disableResolution = v.(bool)
	}
	if v, ok := epMap["disableIPv6"]; ok {
		ep.disableIPv6 = v.(bool)
	}
	if sn, ok := epMap["svcName"]; ok {
		ep.svcName = sn.(string)
	}
	if si, ok := epMap["svcID"]; ok {
		ep.svcID = si.(string)
	}
	if vip, ok := epMap["virtualIP"]; ok {
		ep.virtualIP = net.ParseIP(vip.(string))
	}
	if v, ok := epMap["loadBalancer"]; ok {
		ep.loadBalancer = v.(bool)
	}

	sal, _ := json.Marshal(epMap["svcAliases"]) //nolint:errchkjson // FIXME: handle json (Un)Marshal errors (see above)
	var svcAliases []string
	_ = json.Unmarshal(sal, &svcAliases) //nolint:errcheck
	ep.svcAliases = svcAliases

	pc, _ := json.Marshal(epMap["ingressPorts"]) //nolint:errchkjson // FIXME: handle json (Un)Marshal errors (see above)
	var ingressPorts []*PortConfig
	_ = json.Unmarshal(pc, &ingressPorts) //nolint:errcheck
	ep.ingressPorts = ingressPorts

	// "myAliases" is another legacy (pre-v25) field, used only for the
	// migration below.
	ma, _ := json.Marshal(epMap["myAliases"]) //nolint:errchkjson // FIXME: handle json (Un)Marshal errors (see above)
	var myAliases []string
	_ = json.Unmarshal(ma, &myAliases) //nolint:errcheck

	_, hasDNSNames := epMap["dnsNames"]
	dn, _ := json.Marshal(epMap["dnsNames"]) //nolint:errchkjson // FIXME: handle json (Un)Marshal errors (see above)
	var dnsNames []string
	_ = json.Unmarshal(dn, &dnsNames) //nolint:errcheck
	ep.dnsNames = dnsNames

	// TODO(aker): remove this migration code in v27
	if !hasDNSNames {
		// The field dnsNames was introduced in v25.0. If we don't have it, the on-disk state was written by an older
		// daemon, thus we need to populate dnsNames based off of myAliases and anonymous values.
		if !anonymous {
			myAliases = append([]string{ep.name}, myAliases...)
		}
		ep.dnsNames = sliceutil.Dedup(myAliases)
	}
	return nil
}
// New returns a fresh, empty Endpoint bound to the same network. It is
// part of the datastore.KVObject contract and is used when deserializing
// stored objects.
func (ep *Endpoint) New() datastore.KVObject {
	nw := ep.getNetwork()
	return &Endpoint{network: nw}
}
// CopyTo copies this endpoint's state into o, which must be an *Endpoint.
// Slices get fresh backing arrays so the destination does not alias this
// endpoint's storage; note that ingressPorts elements are pointers and the
// pointed-to PortConfig values remain shared, and generic values are
// copied shallowly.
func (ep *Endpoint) CopyTo(o datastore.KVObject) error {
	ep.mu.Lock()
	defer ep.mu.Unlock()

	dstEp := o.(*Endpoint)
	dstEp.name = ep.name
	dstEp.id = ep.id
	dstEp.sandboxID = ep.sandboxID
	dstEp.dbIndex = ep.dbIndex
	dstEp.dbExists = ep.dbExists
	dstEp.disableResolution = ep.disableResolution
	dstEp.disableIPv6 = ep.disableIPv6
	dstEp.svcName = ep.svcName
	dstEp.svcID = ep.svcID
	dstEp.virtualIP = ep.virtualIP
	dstEp.loadBalancer = ep.loadBalancer

	dstEp.svcAliases = make([]string, len(ep.svcAliases))
	copy(dstEp.svcAliases, ep.svcAliases)

	dstEp.ingressPorts = make([]*PortConfig, len(ep.ingressPorts))
	copy(dstEp.ingressPorts, ep.ingressPorts)

	if ep.iface != nil {
		dstEp.iface = &EndpointInterface{}
		if err := ep.iface.CopyTo(dstEp.iface); err != nil {
			return err
		}
	}

	if ep.joinInfo != nil {
		dstEp.joinInfo = &endpointJoinInfo{}
		if err := ep.joinInfo.CopyTo(dstEp.joinInfo); err != nil {
			return err
		}
	}

	dstEp.exposedPorts = make([]types.TransportPort, len(ep.exposedPorts))
	copy(dstEp.exposedPorts, ep.exposedPorts)

	dstEp.dnsNames = make([]string, len(ep.dnsNames))
	copy(dstEp.dnsNames, ep.dnsNames)

	dstEp.generic = options.Generic{}
	for k, v := range ep.generic {
		dstEp.generic[k] = v
	}

	return nil
}
// ID returns the system-generated id for this endpoint.
func (ep *Endpoint) ID() string {
	ep.mu.Lock()
	id := ep.id
	ep.mu.Unlock()
	return id
}
// Name returns the name of this endpoint.
func (ep *Endpoint) Name() string {
	ep.mu.Lock()
	name := ep.name
	ep.mu.Unlock()
	return name
}
// Network returns the name of the network to which this endpoint is attached.
//
// NOTE(review): unlike getNetwork, this reads ep.network without holding
// ep.mu — presumably safe because the field is set at creation time, but
// sbJoin/sbLeave do reassign it; confirm before relying on this during a
// concurrent join/leave.
func (ep *Endpoint) Network() string {
	if ep.network == nil {
		return ""
	}
	return ep.network.name
}
// getDNSNames returns a copy of the DNS names associated to this endpoint.
// The first entry is the one used for PTR records.
func (ep *Endpoint) getDNSNames() []string {
	ep.mu.Lock()
	defer ep.mu.Unlock()
	out := make([]string, len(ep.dnsNames))
	copy(out, ep.dnsNames)
	return out
}
// isServiceEnabled reports whether service load-balancing state has been
// enabled on this endpoint.
func (ep *Endpoint) isServiceEnabled() bool {
	ep.mu.Lock()
	enabled := ep.serviceEnabled
	ep.mu.Unlock()
	return enabled
}
// enableService marks service state as enabled on the endpoint.
func (ep *Endpoint) enableService() {
	ep.mu.Lock()
	ep.serviceEnabled = true
	ep.mu.Unlock()
}
// disableService marks service state as disabled on the endpoint.
func (ep *Endpoint) disableService() {
	ep.mu.Lock()
	ep.serviceEnabled = false
	ep.mu.Unlock()
}
// needResolver reports whether this endpoint wants the embedded DNS
// resolver (the inverse of its disableResolution flag).
func (ep *Endpoint) needResolver() bool {
	ep.mu.Lock()
	disabled := ep.disableResolution
	ep.mu.Unlock()
	return !disabled
}
// Key returns the endpoint's datastore key.
//
// Key structure: endpoint/network-id/endpoint-id
func (ep *Endpoint) Key() []string {
	n := ep.network
	if n == nil {
		return nil
	}
	return []string{datastore.EndpointKeyPrefix, n.id, ep.id}
}
// KeyPrefix returns the datastore key prefix shared by all endpoints of
// this endpoint's network (endpoint/network-id).
func (ep *Endpoint) KeyPrefix() []string {
	n := ep.network
	if n == nil {
		return nil
	}
	return []string{datastore.EndpointKeyPrefix, n.id}
}
// Value returns the JSON encoding of the endpoint for the datastore, or
// nil if marshaling fails.
func (ep *Endpoint) Value() []byte {
	if b, err := json.Marshal(ep); err == nil {
		return b
	}
	return nil
}
// getSysctls returns the endpoint-scoped sysctls configured via the
// netlabel.EndpointSysctls generic option (a comma-separated string), or
// nil when the option is absent or not a string.
func (ep *Endpoint) getSysctls() []string {
	ep.mu.Lock()
	defer ep.mu.Unlock()
	v, ok := ep.generic[netlabel.EndpointSysctls]
	if !ok {
		return nil
	}
	s, ok := v.(string)
	if !ok {
		return nil
	}
	return strings.Split(s, ",")
}
// SetValue unmarshals a JSON-encoded endpoint (as produced by Value) into
// this endpoint; part of the datastore.KVObject contract.
func (ep *Endpoint) SetValue(value []byte) error {
	return json.Unmarshal(value, ep)
}
// Index returns the datastore index last recorded via SetIndex.
func (ep *Endpoint) Index() uint64 {
	ep.mu.Lock()
	idx := ep.dbIndex
	ep.mu.Unlock()
	return idx
}
// SetIndex records the datastore index and marks the endpoint as existing
// in the store.
func (ep *Endpoint) SetIndex(index uint64) {
	ep.mu.Lock()
	ep.dbIndex = index
	ep.dbExists = true
	ep.mu.Unlock()
}
// Exists reports whether this endpoint is currently present in the
// datastore (as tracked by SetIndex).
func (ep *Endpoint) Exists() bool {
	ep.mu.Lock()
	exists := ep.dbExists
	ep.mu.Unlock()
	return exists
}
// Skip reports whether this endpoint should be skipped by the datastore,
// delegating to the parent network's Skip setting.
func (ep *Endpoint) Skip() bool {
	return ep.getNetwork().Skip()
}
// processOptions applies each non-nil EndpointOption to the endpoint while
// holding ep.mu (the options run with the lock held).
func (ep *Endpoint) processOptions(options ...EndpointOption) {
	ep.mu.Lock()
	defer ep.mu.Unlock()
	for _, opt := range options {
		if opt == nil {
			continue
		}
		opt(ep)
	}
}
// getNetwork returns the in-memory network this endpoint is attached to,
// reading the field under ep.mu.
func (ep *Endpoint) getNetwork() *Network {
	ep.mu.Lock()
	n := ep.network
	ep.mu.Unlock()
	return n
}
// getNetworkFromStore re-fetches this endpoint's network from the
// controller's store, so callers operate on the latest persisted state.
func (ep *Endpoint) getNetworkFromStore() (*Network, error) {
	n := ep.network
	if n == nil {
		return nil, fmt.Errorf("invalid network object in endpoint %s", ep.Name())
	}
	return n.getController().getNetworkFromStore(n.id)
}
// Join joins the sandbox to the endpoint and populates into the sandbox
// the network resources allocated for the endpoint.
//
// Join and Leave on the same sandbox are serialized via sb.joinLeaveMu;
// the actual work happens in sbJoin.
func (ep *Endpoint) Join(ctx context.Context, sb *Sandbox, options ...EndpointOption) error {
	if sb == nil || sb.ID() == "" || sb.Key() == "" {
		return types.InvalidParameterErrorf("invalid Sandbox passed to endpoint join: %v", sb)
	}
	sb.joinLeaveMu.Lock()
	defer sb.joinLeaveMu.Unlock()
	return ep.sbJoin(ctx, sb, options...)
}
// epId returns the endpoint's id, or the empty string when ep is nil.
func epId(ep *Endpoint) string {
	if ep != nil {
		return ep.id
	}
	return ""
}
// epShortId returns the truncated form of the endpoint's id (empty string
// for a nil endpoint); used for concise log output.
func epShortId(ep *Endpoint) string {
	return stringid.TruncateID(epId(ep))
}
// sbJoin performs the actual join of endpoint ep into sandbox sb: it asks
// the network driver to join, publishes service/DNS records, persists the
// endpoint, and (re)programs gateway connectivity for the sandbox. Each
// completed step registers a deferred rollback that runs (in reverse
// order) if a later step fails, leaving the sandbox as it was.
func (ep *Endpoint) sbJoin(ctx context.Context, sb *Sandbox, options ...EndpointOption) (retErr error) {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.sbJoin")
	defer span.End()

	// Re-fetch network and endpoint from the store so we operate on the
	// latest persisted state, not a possibly stale caller copy.
	n, err := ep.getNetworkFromStore()
	if err != nil {
		return fmt.Errorf("failed to get network from store during join: %v", err)
	}
	ep, err = n.getEndpointFromStore(ep.ID())
	if err != nil {
		return fmt.Errorf("failed to get endpoint from store during join: %v", err)
	}

	ctx = log.WithLogger(ctx, log.G(ctx).WithFields(log.Fields{
		"nid": stringid.TruncateID(n.ID()),
		"net": n.Name(),
		"eid": stringid.TruncateID(ep.ID()),
		"ep":  ep.Name(),
	}))

	// Claim the endpoint for this sandbox; only one sandbox may be
	// attached to an endpoint at a time.
	ep.mu.Lock()
	if ep.sandboxID != "" {
		ep.mu.Unlock()
		return types.ForbiddenErrorf("another container is attached to the same network endpoint")
	}
	ep.network = n
	ep.sandboxID = sb.ID()
	ep.joinInfo = &endpointJoinInfo{}
	epid := ep.id
	ep.mu.Unlock()
	defer func() {
		if retErr != nil {
			ep.mu.Lock()
			ep.sandboxID = ""
			ep.mu.Unlock()
		}
	}()

	nid := n.ID()
	ep.processOptions(options...)

	d, err := n.driver(true)
	if err != nil {
		return fmt.Errorf("failed to get driver during join: %v", err)
	}
	if err := d.Join(ctx, nid, epid, sb.Key(), ep, ep.generic, sb.Labels()); err != nil {
		return err
	}
	defer func() {
		if retErr != nil {
			if e := d.Leave(nid, epid); e != nil {
				log.G(ctx).Warnf("driver leave failed while rolling back join: %v", e)
			}
		}
	}()

	// Discard the IPv6 gateway if the endpoint has no IPv6 address (because IPv6
	// is disabled in the container).
	if ep.iface.addrv6 == nil {
		ep.joinInfo.gw6 = nil
	}

	// On non-agent nodes, publish service records locally unless this is a
	// multihost swarm-scoped network (which the agent manages).
	if !n.getController().isAgent() {
		if !n.getController().isSwarmNode() || n.Scope() != scope.Swarm || !n.driverIsMultihost() {
			n.updateSvcRecord(context.WithoutCancel(ctx), ep, true)
		}
	}

	sb.addHostsEntries(ctx, ep.getEtcHostsAddrs())
	if err := sb.updateDNS(n.enableIPv6); err != nil {
		return err
	}

	// Current endpoint(s) providing external connectivity for the sandbox
	gwepBefore4, gwepBefore6 := sb.getGatewayEndpoint()

	sb.addEndpoint(ep)
	defer func() {
		if retErr != nil {
			sb.removeEndpoint(ep)
		}
	}()

	if err := sb.populateNetworkResources(ctx, ep); err != nil {
		return err
	}

	if err := addEpToResolver(ctx, n.Name(), ep.Name(), &sb.config, ep.iface, n.Resolvers()); err != nil {
		return errdefs.System(err)
	}

	if err := n.getController().storeEndpoint(ctx, ep); err != nil {
		return err
	}

	if err := ep.addDriverInfoToCluster(); err != nil {
		return err
	}
	defer func() {
		if retErr != nil {
			if e := ep.deleteDriverInfoFromCluster(); e != nil {
				log.G(ctx).WithError(e).Error("Could not delete endpoint state from cluster on join failure")
			}
		}
	}()

	// Load balancing endpoints should never have a default gateway nor
	// should they alter the status of a network's default gateway
	if ep.loadBalancer && !sb.ingress {
		return nil
	}

	if sb.needDefaultGW() && sb.getEndpointInGWNetwork() == nil {
		return sb.setupDefaultGW()
	}

	// Enable upstream forwarding if the sandbox gained external connectivity.
	if sb.resolver != nil {
		sb.resolver.SetForwardingPolicy(sb.hasExternalAccess())
	}

	gwepAfter4, gwepAfter6 := sb.getGatewayEndpoint()
	log.G(ctx).Infof("sbJoin: gwep4 '%s'->'%s', gwep6 '%s'->'%s'",
		epShortId(gwepBefore4), epShortId(gwepAfter4),
		epShortId(gwepBefore6), epShortId(gwepAfter6))

	// If ep has taken over as a gateway and there were gateways before, update them.
	if ep == gwepAfter4 || ep == gwepAfter6 {
		if gwepBefore4 != nil {
			if err := gwepBefore4.programExternalConnectivity(ctx, gwepAfter4, gwepAfter6); err != nil {
				return fmt.Errorf("updating external connectivity for IPv4 endpoint %s: %v", epShortId(gwepBefore4), err)
			}
			// On failure, restore the previous gateway assignment.
			defer func() {
				if retErr != nil {
					if err := gwepBefore4.programExternalConnectivity(ctx, gwepBefore4, gwepBefore6); err != nil {
						log.G(ctx).WithFields(log.Fields{
							"error":     err,
							"restoreEp": epShortId(gwepBefore4),
						}).Errorf("Failed to restore external IPv4 connectivity")
					}
				}
			}()
		}
		if gwepBefore6 != nil {
			if err := gwepBefore6.programExternalConnectivity(ctx, gwepAfter4, gwepAfter6); err != nil {
				return fmt.Errorf("updating external connectivity for IPv6 endpoint %s: %v", epShortId(gwepBefore6), err)
			}
			defer func() {
				if retErr != nil {
					if err := gwepBefore6.programExternalConnectivity(ctx, gwepBefore4, gwepBefore6); err != nil {
						log.G(ctx).WithFields(log.Fields{
							"error":     err,
							"restoreEp": epShortId(gwepBefore6),
						}).Errorf("Failed to restore external IPv6 connectivity")
					}
				}
			}()
		}
	}

	// Tell the new endpoint whether it's a gateway.
	if err := ep.programExternalConnectivity(ctx, gwepAfter4, gwepAfter6); err != nil {
		return err
	}

	if !sb.needDefaultGW() {
		if e := sb.clearDefaultGW(); e != nil {
			log.G(ctx).WithFields(log.Fields{
				"error": e,
				"sid":   sb.ID(),
				"cid":   sb.ContainerID(),
			}).Warn("Failure while disconnecting sandbox from gateway network")
		}
	}
	return nil
}
// programExternalConnectivity tells this endpoint's driver which endpoints
// currently act as the sandbox's IPv4/IPv6 gateways. gwep4/gwep6 may be
// this endpoint itself, other endpoints, or nil (no gateway). Drivers that
// do not implement driverapi.ExtConner are silently skipped.
func (ep *Endpoint) programExternalConnectivity(ctx context.Context, gwep4, gwep6 *Endpoint) error {
	n, err := ep.getNetworkFromStore()
	if err != nil {
		return types.InternalErrorf("failed to get network from store for programming external connectivity: %v", err)
	}
	d, err := n.driver(true)
	if err != nil {
		return types.InternalErrorf("failed to get driver for programming external connectivity: %v", err)
	}
	if ecd, ok := d.(driverapi.ExtConner); ok {
		log.G(ctx).WithFields(log.Fields{
			"ep":   ep.Name(),
			"epid": epShortId(ep),
			"gw4":  epShortId(gwep4),
			"gw6":  epShortId(gwep6),
		}).Debug("Programming external connectivity on endpoint")
		// context.WithoutCancel: connectivity programming must complete
		// even if the caller's context is cancelled mid-way.
		if err := ecd.ProgramExternalConnectivity(context.WithoutCancel(ctx), n.ID(), ep.ID(), epId(gwep4), epId(gwep6)); err != nil {
			return types.InternalErrorf("driver failed programming external connectivity on endpoint %s (%s): %v",
				ep.Name(), ep.ID(), err)
		}
	}
	return nil
}
// rename changes the endpoint's name under ep.mu and then persists the
// renamed endpoint to the controller's store.
func (ep *Endpoint) rename(name string) error {
	ep.mu.Lock()
	ep.name = name
	ep.mu.Unlock()
	// Update the store with the updated name
	return ep.getNetwork().getController().storeEndpoint(context.TODO(), ep)
}
// UpdateDNSNames replaces this endpoint's DNS names and propagates the
// change: via the cluster agent in swarm (agent) mode, or via the
// network's local service records otherwise. The updated endpoint is then
// persisted to the store. If the endpoint's sandbox is gone, the update is
// skipped and nil is returned.
//
// NOTE(review): ep.dnsNames is written here without holding ep.mu —
// presumably callers serialize DNS-name updates; confirm.
func (ep *Endpoint) UpdateDNSNames(dnsNames []string) error {
	nw := ep.getNetwork()
	c := nw.getController()
	sb, ok := ep.getSandbox()
	if !ok {
		log.G(context.TODO()).WithFields(log.Fields{
			"sandboxID":  ep.sandboxID,
			"endpointID": ep.ID(),
		}).Warn("DNSNames update aborted, sandbox is not present anymore")
		return nil
	}

	if c.isAgent() {
		// Agent mode: remove the old service state, swap the names, then
		// re-add, so cluster peers see a consistent record.
		if err := ep.deleteServiceInfoFromCluster(sb, true, "UpdateDNSNames"); err != nil {
			return types.InternalErrorf("could not delete service state for endpoint %s from cluster on UpdateDNSNames: %v", ep.Name(), err)
		}
		ep.dnsNames = dnsNames
		if err := ep.addServiceInfoToCluster(sb); err != nil {
			return types.InternalErrorf("could not add service state for endpoint %s to cluster on UpdateDNSNames: %v", ep.Name(), err)
		}
	} else {
		// Local mode: retract old records, swap the names, publish new ones.
		nw.updateSvcRecord(context.WithoutCancel(context.TODO()), ep, false)
		ep.dnsNames = dnsNames
		nw.updateSvcRecord(context.WithoutCancel(context.TODO()), ep, true)
	}

	// Update the store with the updated name
	if err := c.storeEndpoint(context.TODO(), ep); err != nil {
		return err
	}
	return nil
}
// hasInterface reports whether this endpoint's interface uses iName as its
// source interface name.
func (ep *Endpoint) hasInterface(iName string) bool {
	ep.mu.Lock()
	defer ep.mu.Unlock()
	if ep.iface == nil {
		return false
	}
	return ep.iface.srcName == iName
}
// Leave detaches the network resources populated in the sandbox.
//
// Join and Leave on the same sandbox are serialized via sb.joinLeaveMu;
// the actual work happens in sbLeave (non-forced).
func (ep *Endpoint) Leave(ctx context.Context, sb *Sandbox) error {
	if sb == nil || sb.ID() == "" || sb.Key() == "" {
		return types.InvalidParameterErrorf("invalid Sandbox passed to endpoint leave: %v", sb)
	}
	sb.joinLeaveMu.Lock()
	defer sb.joinLeaveMu.Unlock()
	return ep.sbLeave(ctx, sb, false)
}
// sbLeave detaches endpoint ep from sandbox sb: it revokes external
// connectivity, asks the driver to leave, removes service/DNS records and
// /etc/hosts entries, persists the detached endpoint, and re-elects
// gateway endpoints for the sandbox. With force set, a missing driver is
// tolerated and most cleanup errors are logged rather than returned.
func (ep *Endpoint) sbLeave(ctx context.Context, sb *Sandbox, force bool) error {
	// Operate on the latest persisted network/endpoint state.
	n, err := ep.getNetworkFromStore()
	if err != nil {
		return fmt.Errorf("failed to get network from store during leave: %v", err)
	}
	ep, err = n.getEndpointFromStore(ep.ID())
	if err != nil {
		return fmt.Errorf("failed to get endpoint from store during leave: %v", err)
	}

	ctx = log.WithLogger(ctx, log.G(ctx).WithFields(log.Fields{
		"nid": n.ID(),
		"net": n.Name(),
		"eid": ep.ID(),
		"ep":  ep.Name(),
	}))

	// The endpoint must currently be attached to exactly this sandbox.
	ep.mu.Lock()
	sid := ep.sandboxID
	ep.mu.Unlock()
	if sid == "" {
		return types.ForbiddenErrorf("cannot leave endpoint with no attached sandbox")
	}
	if sid != sb.ID() {
		return types.ForbiddenErrorf("unexpected sandbox ID in leave request. Expected %s. Got %s", ep.sandboxID, sb.ID())
	}

	d, err := n.driver(!force)
	if err != nil {
		return fmt.Errorf("failed to get driver during endpoint leave: %v", err)
	}

	ep.mu.Lock()
	ep.sandboxID = ""
	ep.network = n
	ep.mu.Unlock()

	if d != nil {
		if ecd, ok := d.(driverapi.ExtConner); ok {
			// Empty gateway IDs revoke any external connectivity the
			// driver programmed for this endpoint.
			if err := ecd.ProgramExternalConnectivity(context.WithoutCancel(ctx), n.ID(), ep.ID(), "", ""); err != nil {
				log.G(ctx).WithError(err).Warn("driver failed revoking external connectivity on endpoint")
			}
		}
		if err := d.Leave(n.id, ep.id); err != nil {
			if _, ok := err.(types.MaskableError); !ok {
				log.G(ctx).WithError(err).Warn("driver error disconnecting container")
			}
		}
	}

	if err := ep.deleteServiceInfoFromCluster(sb, true, "sbLeave"); err != nil {
		log.G(ctx).WithError(err).Warn("Failed to clean up service info on container disconnect")
	}

	if err := deleteEpFromResolver(ep.Name(), ep.iface, n.Resolvers()); err != nil {
		log.G(ctx).WithError(err).Warn("Failed to clean up resolver info on container disconnect")
	}

	// Capture the addresses that were added to the container's /etc/hosts here,
	// before the endpoint is deleted, so that they can be removed from /etc/hosts.
	etcHostsAddrs := ep.getEtcHostsAddrs()

	if err := sb.clearNetworkResources(ep); err != nil {
		log.G(ctx).WithError(err).Warn("Failed to clean up network resources on container disconnect")
	}

	// Even if the interface was initially created in the container's namespace, it's
	// now been moved out. When a legacy link is deleted, the Endpoint is removed and
	// then re-added to the Sandbox. So, to make sure the re-add works, note that the
	// interface is now outside the container's netns.
	ep.iface.createdInContainer = false

	// Update the store about the sandbox detach only after we
	// have completed sb.clearNetworkResources above to avoid
	// spurious logs when cleaning up the sandbox when the daemon
	// ungracefully exits and restarts before completing sandbox
	// detach but after store has been updated.
	if err := n.getController().storeEndpoint(ctx, ep); err != nil {
		return err
	}

	if e := ep.deleteDriverInfoFromCluster(); e != nil {
		log.G(ctx).WithError(e).Error("Failed to delete endpoint state for endpoint from cluster")
	}

	// When a container is connected to a network, it gets /etc/hosts
	// entries for its addresses on that network. So, when it's connected
	// to two networks, it has a hosts entry for each. For example, if
	// the hostname is the default short-id, and it's connected to two
	// networks (172.19.0.0/16 and 172.20.0.0/17, plus IPv6 address for
	// each), the hosts file might include:
	//
	//	172.19.0.2	4b92a573912d
	//	fd8c:c894:d68::2	4b92a573912d
	//	172.20.0.2	4b92a573912d
	//	fd8c:c894:d68:1::2	4b92a573912d
	//
	// If the container is disconnected from 172.19.0.2, only remove
	// the hosts entries with addresses on that network.
	sb.deleteHostsEntries(etcHostsAddrs)

	if !sb.inDelete && sb.needDefaultGW() && sb.getEndpointInGWNetwork() == nil {
		return sb.setupDefaultGW()
	}

	// Disable upstream forwarding if the sandbox lost external connectivity.
	if sb.resolver != nil {
		sb.resolver.SetForwardingPolicy(sb.hasExternalAccess())
	}

	// Find new endpoint(s) to provide external connectivity for the sandbox.
	gwepAfter4, gwepAfter6 := sb.getGatewayEndpoint()
	if gwepAfter4 != nil {
		if err := gwepAfter4.programExternalConnectivity(ctx, gwepAfter4, gwepAfter6); err != nil {
			log.G(ctx).WithError(err).Error("Failed to set IPv4 gateway")
		}
	}
	if gwepAfter6 != nil && gwepAfter6 != gwepAfter4 {
		if err := gwepAfter6.programExternalConnectivity(ctx, gwepAfter4, gwepAfter6); err != nil {
			log.G(ctx).WithError(err).Error("Failed to set IPv6 gateway")
		}
	}

	if !sb.needDefaultGW() {
		if err := sb.clearDefaultGW(); err != nil {
			log.G(ctx).WithFields(log.Fields{
				"error": err,
				"sid":   sb.ID(),
				"cid":   sb.ContainerID(),
			}).Warn("Failure while disconnecting sandbox from gateway network")
		}
	}
	return nil
}
// Delete deletes and detaches this endpoint from the network.
//
// If the endpoint is still attached to a sandbox and force is false, the
// call fails with *ActiveContainerError. With force set, the endpoint
// first leaves its sandbox (best effort) and driver errors are tolerated
// where possible.
func (ep *Endpoint) Delete(ctx context.Context, force bool) error {
	var err error
	n, err := ep.getNetworkFromStore()
	if err != nil {
		return fmt.Errorf("failed to get network during Delete: %v", err)
	}
	ep, err = n.getEndpointFromStore(ep.ID())
	if err != nil {
		return fmt.Errorf("failed to get endpoint from store during Delete: %v", err)
	}
	ep.mu.Lock()
	epid := ep.id
	name := ep.name
	sbid := ep.sandboxID
	ep.mu.Unlock()

	sb, _ := n.getController().SandboxByID(sbid)
	if sb != nil && !force {
		return &ActiveContainerError{name: name, id: epid}
	}

	if sb != nil {
		if e := ep.sbLeave(context.WithoutCancel(ctx), sb, force); e != nil {
			log.G(ctx).Warnf("failed to leave sandbox for endpoint %s : %v", name, e)
		}
	}

	if err = n.getController().deleteStoredEndpoint(ep); err != nil {
		return err
	}
	// On later failure (unless forced), restore the endpoint in the store
	// so persisted state stays consistent with the still-existing driver
	// endpoint. Note this checks the named `err`, set by the steps below.
	defer func() {
		if err != nil && !force {
			ep.dbExists = false
			if e := n.getController().storeEndpoint(context.WithoutCancel(ctx), ep); e != nil {
				log.G(ctx).Warnf("failed to recreate endpoint in store %s : %v", name, e)
			}
		}
	}()

	// Multihost swarm-scoped networks have their service records managed
	// elsewhere; only retract records for other networks.
	if !n.getController().isSwarmNode() || n.Scope() != scope.Swarm || !n.driverIsMultihost() {
		n.updateSvcRecord(context.WithoutCancel(ctx), ep, false)
	}

	if err = ep.deleteEndpoint(force); err != nil && !force {
		return err
	}

	ep.releaseAddress()
	return nil
}
// deleteEndpoint asks the network driver to remove this endpoint. With
// force set, a missing driver is tolerated (treated as success). A driver
// error is only propagated when it is a types.ForbiddenError; any other
// error is logged (unless it is maskable) and swallowed.
func (ep *Endpoint) deleteEndpoint(force bool) error {
	ep.mu.Lock()
	n := ep.network
	name := ep.name
	epid := ep.id
	ep.mu.Unlock()

	driver, err := n.driver(!force)
	if err != nil {
		return fmt.Errorf("failed to delete endpoint: %v", err)
	}

	if driver == nil {
		return nil
	}

	if err := driver.DeleteEndpoint(n.id, epid); err != nil {
		if _, ok := err.(types.ForbiddenError); ok {
			return err
		}
		if _, ok := err.(types.MaskableError); !ok {
			log.G(context.TODO()).Warnf("driver error deleting endpoint %s : %v", name, err)
		}
	}
	return nil
}
// getSandbox looks up the sandbox this endpoint is attached to on the
// controller; ok is false when the sandbox no longer exists.
func (ep *Endpoint) getSandbox() (*Sandbox, bool) {
	c := ep.network.getController()

	ep.mu.Lock()
	sid := ep.sandboxID
	ep.mu.Unlock()

	c.mu.Lock()
	defer c.mu.Unlock()
	sb, ok := c.sandboxes[sid]
	return sb, ok
}
// getEtcHostsAddrs returns the endpoint's IPv4/IPv6 addresses that should
// be written to the container's /etc/hosts. Endpoints on internal
// plumbing networks (ingress, the gateway network) contribute nothing.
func (ep *Endpoint) getEtcHostsAddrs() []netip.Addr {
	ep.mu.Lock()
	defer ep.mu.Unlock()

	// Do not update hosts file with internal network's endpoint IP
	if n := ep.network; n == nil || n.ingress || n.Name() == libnGWNetwork {
		return nil
	}

	var addrs []netip.Addr
	for _, ipn := range []*net.IPNet{ep.iface.addr, ep.iface.addrv6} {
		if ipn == nil {
			continue
		}
		if a, ok := netip.AddrFromSlice(ipn.IP); ok {
			addrs = append(addrs, a)
		}
	}
	return addrs
}
// EndpointOptionGeneric function returns an option setter for a Generic option defined
// in a Dictionary of Key-Value pair
func EndpointOptionGeneric(generic map[string]any) EndpointOption {
	return func(ep *Endpoint) {
		// Lazily initialize the map: writing to a nil map panics, and an
		// endpoint built without generic options would otherwise crash here.
		if ep.generic == nil {
			ep.generic = make(map[string]any, len(generic))
		}
		for k, v := range generic {
			ep.generic[k] = v
		}
	}
}
// Masks applied by CreateOptionIpam to caller-supplied link-local
// addresses: /16 for IPv4 addresses, /64 for IPv6 addresses.
var (
	linkLocalMask     = net.CIDRMask(16, 32)
	linkLocalMaskIPv6 = net.CIDRMask(64, 128)
)
// CreateOptionIpam returns an option setter for this endpoint's IPAM
// configuration: preferred IPv4/IPv6 addresses, link-local addresses, and
// IPAM driver options.
func CreateOptionIpam(ipV4, ipV6 net.IP, llIPs []net.IP, ipamOptions map[string]string) EndpointOption {
	return func(ep *Endpoint) {
		ep.prefAddress = ipV4
		ep.prefAddressV6 = ipV6
		for _, ip := range llIPs {
			// IPv4 link-locals get a /16 mask, IPv6 a /64.
			mask := linkLocalMask
			if ip.To4() == nil {
				mask = linkLocalMaskIPv6
			}
			ep.iface.llAddrs = append(ep.iface.llAddrs, &net.IPNet{IP: ip, Mask: mask})
		}
		ep.ipamOptions = ipamOptions
	}
}
// CreateOptionExposedPorts returns an option setter for the container's
// exposed ports, to be passed to [Network.CreateEndpoint]. The ports are
// stored both on the endpoint and under netlabel.ExposedPorts in generic,
// because the driver reads the latter.
func CreateOptionExposedPorts(exposedPorts []types.TransportPort) EndpointOption {
	return func(ep *Endpoint) {
		// Defensive copy so later caller mutations don't leak in.
		ports := make([]types.TransportPort, len(exposedPorts))
		copy(ports, exposedPorts)
		ep.exposedPorts = ports
		ep.generic[netlabel.ExposedPorts] = ports
	}
}
// CreateOptionPortMapping returns an option setter for the port-mapping
// option, to be passed to [Network.CreateEndpoint]. The bindings are
// stored under netlabel.PortMap in generic for the driver to consume.
func CreateOptionPortMapping(portBindings []types.PortBinding) EndpointOption {
	return func(ep *Endpoint) {
		// Defensive copy of the bindings before handing them to the driver.
		bindings := make([]types.PortBinding, len(portBindings))
		copy(bindings, portBindings)
		ep.generic[netlabel.PortMap] = bindings
	}
}
// CreateOptionDNS returns an option setter that records DNS server entries
// under netlabel.DNSServers in the endpoint's generic options.
func CreateOptionDNS(dns []string) EndpointOption {
	return func(e *Endpoint) {
		e.generic[netlabel.DNSServers] = dns
	}
}
// CreateOptionDNSNames specifies the list of (non fully qualified) DNS
// names associated to an endpoint. These populate the embedded DNS server.
// Order matters: the first name is used to generate PTR records.
func CreateOptionDNSNames(names []string) EndpointOption {
	return func(e *Endpoint) {
		e.dnsNames = names
	}
}
// CreateOptionDisableResolution returns an option setter marking that this
// endpoint opts out of embedded DNS server functionality.
func CreateOptionDisableResolution() EndpointOption {
	return func(e *Endpoint) {
		e.disableResolution = true
	}
}
// CreateOptionDisableIPv6 prevents allocation of an IPv6 address/gateway,
// even if the container is connected to an IPv6-enabled network.
func CreateOptionDisableIPv6() EndpointOption {
	return func(e *Endpoint) {
		e.disableIPv6 = true
	}
}
// CreateOptionAlias returns an option setter that records an endpoint
// alias, mapping alias -> name in the endpoint's alias table (created
// lazily on first use).
func CreateOptionAlias(name string, alias string) EndpointOption {
	return func(e *Endpoint) {
		if e.aliases == nil {
			e.aliases = map[string]string{}
		}
		e.aliases[alias] = name
	}
}
// CreateOptionService returns an option setter recording the swarm service
// binding for this endpoint: service name/id, virtual IP, ingress ports,
// and service aliases.
func CreateOptionService(name, id string, vip net.IP, ingressPorts []*PortConfig, aliases []string) EndpointOption {
	return func(e *Endpoint) {
		e.svcName = name
		e.svcID = id
		e.virtualIP = vip
		e.ingressPorts = ingressPorts
		e.svcAliases = aliases
	}
}
// CreateOptionLoadBalancer returns an option setter marking this endpoint
// as a load balancer for its network.
func CreateOptionLoadBalancer() EndpointOption {
	return func(e *Endpoint) {
		e.loadBalancer = true
	}
}
// JoinOptionPriority function returns an option setter for priority option to
// be passed to the endpoint.Join() method. The priority is recorded on the
// endpoint's sandbox (keyed by endpoint id); if the sandbox cannot be
// found, the option logs an error and does nothing.
func JoinOptionPriority(prio int) EndpointOption {
	return func(ep *Endpoint) {
		// ep lock already acquired
		c := ep.network.getController()
		c.mu.Lock()
		sb, ok := c.sandboxes[ep.sandboxID]
		c.mu.Unlock()
		if !ok {
			log.G(context.TODO()).Errorf("Could not set endpoint priority value during Join to endpoint %s: No sandbox id present in endpoint", ep.id)
			return
		}
		sb.epPriority[ep.id] = prio
	}
}
// WithNetnsPath returns an option setter recording the network-namespace
// path on the endpoint's interface.
func WithNetnsPath(path string) EndpointOption {
	return func(e *Endpoint) {
		e.iface.netnsPath = path
	}
}
// assignAddress obtains addresses for this endpoint's interface from ipam,
// one per requested address family. Networks with a special driver manage
// addressing themselves, so nothing is done for them.
func (ep *Endpoint) assignAddress(ipam ipamapi.Ipam, assignIPv4, assignIPv6 bool) error {
	n := ep.getNetwork()
	if n.hasSpecialDriver() {
		return nil
	}

	log.G(context.TODO()).Debugf("Assigning addresses for endpoint %s's interface on network %s", ep.Name(), n.Name())

	families := []struct {
		wanted bool
		ver    int
	}{
		{assignIPv4, 4},
		{assignIPv6, 6},
	}
	for _, fam := range families {
		if !fam.wanted {
			continue
		}
		if err := ep.assignAddressVersion(fam.ver, ipam); err != nil {
			return err
		}
	}
	return nil
}
// assignAddressVersion requests an IPv4 or IPv6 address (selected by
// ipVer) from ipam for this endpoint's interface. If a preferred address
// was configured (or the interface already holds one), only the pool
// containing it is tried; otherwise each of the network's pools for that
// family is tried until one succeeds or all are exhausted.
func (ep *Endpoint) assignAddressVersion(ipVer int, ipam ipamapi.Ipam) error {
	var (
		poolID  *string
		address **net.IPNet
		prefAdd net.IP
		progAdd net.IP
	)

	n := ep.getNetwork()
	// poolID/address point into ep.iface so a successful allocation can be
	// written back with the same code for either address family.
	switch ipVer {
	case 4:
		poolID = &ep.iface.v4PoolID
		address = &ep.iface.addr
		prefAdd = ep.prefAddress
	case 6:
		poolID = &ep.iface.v6PoolID
		address = &ep.iface.addrv6
		prefAdd = ep.prefAddressV6
	default:
		return types.InternalErrorf("incorrect ip version number passed: %d", ipVer)
	}

	ipInfo := n.getIPInfo(ipVer)

	// ipInfo is a slice of the network's address pools for this family.
	if len(ipInfo) == 0 {
		return fmt.Errorf("no IPv%d information available for endpoint %s", ipVer, ep.Name())
	}

	// The address to program may be chosen by the user or by the network driver in one specific
	// case to support backward compatibility with `docker daemon --fixed-cidrv6` use case
	if prefAdd != nil {
		progAdd = prefAdd
	} else if *address != nil {
		progAdd = (*address).IP
	}

	for _, d := range ipInfo {
		if progAdd != nil && !d.Pool.Contains(progAdd) {
			continue
		}
		addr, _, err := ipam.RequestAddress(d.PoolID, progAdd, ep.ipamOptions)
		if err == nil {
			ep.mu.Lock()
			*address = addr
			*poolID = d.PoolID
			ep.mu.Unlock()
			return nil
		}
		// Move on to the next pool only when this one is exhausted and no
		// specific address was requested; any other error is final.
		if !errors.Is(err, ipamapi.ErrNoAvailableIPs) || progAdd != nil {
			return err
		}
	}
	if progAdd != nil {
		return types.InvalidParameterErrorf("invalid address %s: It does not belong to any of this network's subnets", prefAdd)
	}
	return fmt.Errorf("no available IPv%d addresses on this network's address pools: %s (%s)", ipVer, n.Name(), n.ID())
}
// releaseAddress returns the endpoint interface's IPv4 and IPv6 addresses to
// the network's IPAM driver. Failures are logged but not fatal, since the
// endpoint is being deleted anyway.
func (ep *Endpoint) releaseAddress() {
	n := ep.getNetwork()
	if n.hasSpecialDriver() {
		return
	}
	log.G(context.TODO()).Debugf("Releasing addresses for endpoint %s's interface on network %s", ep.Name(), n.Name())
	ipam, _, err := n.getController().getIPAMDriver(n.ipamType)
	if err != nil {
		log.G(context.TODO()).Warnf("Failed to retrieve ipam driver to release interface address on delete of endpoint %s (%s): %v", ep.Name(), ep.ID(), err)
		return
	}
	// Release one address family; a nil address means nothing was assigned.
	release := func(poolID string, addr *net.IPNet) {
		if addr == nil {
			return
		}
		if err := ipam.ReleaseAddress(poolID, addr.IP); err != nil {
			log.G(context.TODO()).Warnf("Failed to release ip address %s on delete of endpoint %s (%s): %v", addr.IP, ep.Name(), ep.ID(), err)
		}
	}
	release(ep.iface.v4PoolID, ep.iface.addr)
	release(ep.iface.v6PoolID, ep.iface.addrv6)
}
// cleanupLocalEndpoints deletes stale endpoints: those present in a network's
// store but no longer attached to any of the controller's sandboxes.
func (c *Controller) cleanupLocalEndpoints() error {
	// Collect the IDs of endpoints still in use by a sandbox. A set is
	// modelled as map[string]struct{} (the original map[string]any holding
	// `true` wasted space and was non-idiomatic).
	inUse := make(map[string]struct{})
	for _, sb := range c.sandboxes {
		for _, ep := range sb.endpoints {
			inUse[ep.id] = struct{}{}
		}
	}
	nl, err := c.getNetworks()
	if err != nil {
		return fmt.Errorf("could not get list of networks: %v", err)
	}
	for _, n := range nl {
		// Config-only networks don't carry endpoints of their own.
		if n.ConfigOnly() {
			continue
		}
		epl, err := n.getEndpointsFromStore()
		if err != nil {
			log.G(context.TODO()).Warnf("Could not get list of endpoints in network %s during endpoint cleanup: %v", n.name, err)
			continue
		}
		for _, ep := range epl {
			if _, ok := inUse[ep.id]; ok {
				continue
			}
			log.G(context.TODO()).Infof("Removing stale endpoint %s (%s)", ep.name, ep.id)
			if err := ep.Delete(context.WithoutCancel(context.TODO()), true); err != nil {
				log.G(context.TODO()).Warnf("Could not delete local endpoint %s during endpoint cleanup: %v", ep.name, err)
			}
		}
	}
	return nil
}
package libnetwork
import (
"encoding/json"
"sync"
"github.com/docker/docker/daemon/libnetwork/datastore"
)
// endpointCnt was used to refcount network-endpoint relationships. It's
// unused since v28.1, and kept around only to ensure that users can properly
// downgrade.
//
// TODO(aker): remove this struct in v30.
type endpointCnt struct {
	n     *Network // network this count belongs to; not serialized
	Count uint64   // number of endpoints (exported so it round-trips via JSON)
	// datastore bookkeeping; guarded by the embedded mutex.
	dbIndex  uint64
	dbExists bool
	sync.Mutex
}

// epCntKeyPrefix is the datastore key prefix under which counts are stored.
const epCntKeyPrefix = "endpoint_count"
// Key returns the datastore key for this endpoint count.
func (ec *endpointCnt) Key() []string {
	ec.Lock()
	key := []string{epCntKeyPrefix, ec.n.id}
	ec.Unlock()
	return key
}
// KeyPrefix returns the datastore key prefix; identical to Key for this type.
func (ec *endpointCnt) KeyPrefix() []string {
	ec.Lock()
	prefix := []string{epCntKeyPrefix, ec.n.id}
	ec.Unlock()
	return prefix
}
// Value serializes the endpoint count to JSON; nil is returned on a
// marshalling failure.
func (ec *endpointCnt) Value() []byte {
	ec.Lock()
	defer ec.Unlock()
	if b, err := json.Marshal(ec); err == nil {
		return b
	}
	return nil
}
// SetValue restores the endpoint count from its JSON representation.
func (ec *endpointCnt) SetValue(value []byte) error {
	ec.Lock()
	defer ec.Unlock()
	// Unmarshal into ec directly; the previous code passed &ec (a
	// **endpointCnt), which json accepts but which could silently reset the
	// local pointer on a JSON "null" input instead of being a no-op.
	return json.Unmarshal(value, ec)
}
// Index returns the latest datastore index at which this object was written.
func (ec *endpointCnt) Index() uint64 {
	ec.Lock()
	idx := ec.dbIndex
	ec.Unlock()
	return idx
}
// SetIndex records the datastore index and marks the object as persisted.
func (ec *endpointCnt) SetIndex(index uint64) {
	ec.Lock()
	defer ec.Unlock()
	ec.dbIndex = index
	ec.dbExists = true
}
// Exists reports whether the object has been persisted to the datastore.
func (ec *endpointCnt) Exists() bool {
	ec.Lock()
	exists := ec.dbExists
	ec.Unlock()
	return exists
}
// Skip reports whether persistence should be skipped for this object, which
// is the case when the owning network itself is not persisted.
func (ec *endpointCnt) Skip() bool {
	ec.Lock()
	persist := ec.n.persist
	ec.Unlock()
	return !persist
}
// New returns a fresh endpointCnt bound to the same network.
func (ec *endpointCnt) New() datastore.KVObject {
	ec.Lock()
	defer ec.Unlock()
	fresh := &endpointCnt{n: ec.n}
	return fresh
}
// CopyTo copies this object's state into o, which must be an *endpointCnt.
func (ec *endpointCnt) CopyTo(o datastore.KVObject) error {
	ec.Lock()
	defer ec.Unlock()
	dst := o.(*endpointCnt)
	dst.n = ec.n
	dst.Count = ec.Count
	dst.dbIndex = ec.dbIndex
	dst.dbExists = ec.dbExists
	return nil
}
package libnetwork
import (
"encoding/json"
"fmt"
"net"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/types"
)
// EndpointInfo provides an interface to retrieve network resources bound to the endpoint.
type EndpointInfo interface {
	// Iface returns information about the interface which was assigned to
	// the endpoint by the driver. This can be used after the
	// endpoint has been created.
	Iface() *EndpointInterface

	// Gateway returns the IPv4 gateway assigned by the driver.
	// This will only return a valid value if a container has joined the endpoint.
	Gateway() net.IP

	// GatewayIPv6 returns the IPv6 gateway assigned by the driver.
	// This will only return a valid value if a container has joined the endpoint.
	GatewayIPv6() net.IP

	// StaticRoutes returns the list of static routes configured by the network
	// driver when the container joins a network.
	StaticRoutes() []*types.StaticRoute

	// Sandbox returns the attached sandbox if there, nil otherwise.
	Sandbox() *Sandbox

	// LoadBalancer returns whether the endpoint is the load balancer endpoint for the network.
	LoadBalancer() bool
}
// EndpointInterface holds interface addresses bound to the endpoint.
type EndpointInterface struct {
	mac     net.HardwareAddr
	addr    *net.IPNet   // IPv4 address
	addrv6  *net.IPNet   // IPv6 address
	llAddrs []*net.IPNet // link-local addresses (IPv4/IPv6)
	srcName string       // name of the interface on the host side
	dstPrefix string
	dstName   string // dstName is the name of the interface in the container namespace. It takes precedence over dstPrefix.
	routes    []*net.IPNet
	// IDs of the IPAM pools the v4/v6 addresses were allocated from.
	v4PoolID string
	v6PoolID string
	// netnsPath is the network namespace the interface lives in, if known.
	netnsPath string
	// createdInContainer is set by drivers that create the interface directly
	// in the container's namespace (so it doesn't need to be moved there).
	createdInContainer bool
}
// MarshalJSON encodes the endpoint interface as a JSON object. Address and
// MAC fields are only emitted when set.
func (epi *EndpointInterface) MarshalJSON() ([]byte, error) {
	out := map[string]interface{}{
		"srcName":            epi.srcName,
		"dstPrefix":          epi.dstPrefix,
		"dstName":            epi.dstName,
		"v4PoolID":           epi.v4PoolID,
		"v6PoolID":           epi.v6PoolID,
		"createdInContainer": epi.createdInContainer,
	}
	if epi.mac != nil {
		out["mac"] = epi.mac.String()
	}
	if epi.addr != nil {
		out["addr"] = epi.addr.String()
	}
	if epi.addrv6 != nil {
		out["addrv6"] = epi.addrv6.String()
	}
	if len(epi.llAddrs) != 0 {
		lls := make([]string, 0, len(epi.llAddrs))
		for _, ll := range epi.llAddrs {
			lls = append(lls, ll.String())
		}
		out["llAddrs"] = lls
	}
	var routes []string
	for _, route := range epi.routes {
		routes = append(routes, route.String())
	}
	out["routes"] = routes
	return json.Marshal(out)
}
// UnmarshalJSON decodes the representation produced by MarshalJSON back into
// the endpoint interface.
//
// NOTE(review): the unchecked type assertions (e.g. epMap["srcName"].(string))
// panic if a required key is missing or mistyped; the input is assumed to be
// data this package wrote earlier.
func (epi *EndpointInterface) UnmarshalJSON(b []byte) error {
	var (
		err   error
		epMap map[string]interface{}
	)
	if err = json.Unmarshal(b, &epMap); err != nil {
		return err
	}
	if v, ok := epMap["mac"]; ok {
		if epi.mac, err = net.ParseMAC(v.(string)); err != nil {
			return types.InternalErrorf("failed to decode endpoint interface mac address after json unmarshal: %s", v.(string))
		}
	}
	if v, ok := epMap["addr"]; ok {
		if epi.addr, err = types.ParseCIDR(v.(string)); err != nil {
			return types.InternalErrorf("failed to decode endpoint interface ipv4 address after json unmarshal: %v", err)
		}
	}
	if v, ok := epMap["addrv6"]; ok {
		if epi.addrv6, err = types.ParseCIDR(v.(string)); err != nil {
			return types.InternalErrorf("failed to decode endpoint interface ipv6 address after json unmarshal: %v", err)
		}
	}
	if v, ok := epMap["llAddrs"]; ok {
		list := v.([]interface{})
		epi.llAddrs = make([]*net.IPNet, 0, len(list))
		for _, llS := range list {
			ll, err := types.ParseCIDR(llS.(string))
			if err != nil {
				return types.InternalErrorf("failed to decode endpoint interface link-local address (%v) after json unmarshal: %v", llS, err)
			}
			epi.llAddrs = append(epi.llAddrs, ll)
		}
	}
	epi.srcName = epMap["srcName"].(string)
	epi.dstPrefix = epMap["dstPrefix"].(string)
	// Round-trip the routes value through JSON to coerce []interface{} into
	// []string.
	// TODO(cpuguy83): linter noticed we don't check the error here... no idea why but it seems like it could introduce problems if we start checking
	rb, _ := json.Marshal(epMap["routes"]) //nolint:errchkjson // FIXME: handle json (Un)Marshal errors (see above)
	var routes []string
	_ = json.Unmarshal(rb, &routes) //nolint:errcheck
	epi.routes = make([]*net.IPNet, 0)
	for _, route := range routes {
		ip, ipr, err := net.ParseCIDR(route)
		if err == nil {
			// Keep the host part of the address, not just the masked network.
			ipr.IP = ip
			epi.routes = append(epi.routes, ipr)
		}
	}
	epi.v4PoolID = epMap["v4PoolID"].(string)
	epi.v6PoolID = epMap["v6PoolID"].(string)
	if v, ok := epMap["createdInContainer"]; ok {
		epi.createdInContainer = v.(bool)
	}
	return nil
}
// CopyTo copies this interface's state into dstEpi. The MAC and IP values are
// cloned; the llAddrs/routes slices get fresh backing arrays, though their
// elements still point at the same IPNet values for llAddrs.
func (epi *EndpointInterface) CopyTo(dstEpi *EndpointInterface) error {
	dstEpi.mac = types.GetMacCopy(epi.mac)
	dstEpi.addr = types.GetIPNetCopy(epi.addr)
	dstEpi.addrv6 = types.GetIPNetCopy(epi.addrv6)
	dstEpi.srcName = epi.srcName
	dstEpi.dstPrefix = epi.dstPrefix
	dstEpi.dstName = epi.dstName
	dstEpi.v4PoolID = epi.v4PoolID
	dstEpi.v6PoolID = epi.v6PoolID
	dstEpi.createdInContainer = epi.createdInContainer
	if len(epi.llAddrs) != 0 {
		dstEpi.llAddrs = make([]*net.IPNet, len(epi.llAddrs))
		copy(dstEpi.llAddrs, epi.llAddrs)
	}
	for _, route := range epi.routes {
		dstEpi.routes = append(dstEpi.routes, types.GetIPNetCopy(route))
	}
	return nil
}
// endpointJoinInfo holds the data a driver supplies when a container joins
// the endpoint: gateways, static routes, gossip table entries, and whether
// the default gateway service should be skipped.
type endpointJoinInfo struct {
	gw                    net.IP // IPv4 default gateway
	gw6                   net.IP // IPv6 default gateway
	StaticRoutes          []*types.StaticRoute
	driverTableEntries    []*tableEntry
	disableGatewayService bool
}
// tableEntry is a single key/value pair a driver published to a named gossip
// table.
type tableEntry struct {
	tableName string
	key       string
	value     []byte
}
// Info hydrates the endpoint and returns certain operational data belonging
// to this endpoint.
//
// TODO(thaJeztah): make sure that Endpoint is always fully hydrated, and remove the EndpointInfo interface, and use Endpoint directly.
func (ep *Endpoint) Info() EndpointInfo {
	// An endpoint already bound to a sandbox is considered hydrated.
	if ep.sandboxID != "" {
		return ep
	}
	n, err := ep.getNetworkFromStore()
	if err != nil {
		// NOTE(review): lookup errors are swallowed and nil is returned;
		// callers must handle a nil EndpointInfo.
		return nil
	}
	// Re-read the endpoint from the store to pick up the latest state.
	ep, err = n.getEndpointFromStore(ep.ID())
	if err != nil {
		return nil
	}
	sb, ok := ep.getSandbox()
	if !ok {
		// endpoint hasn't joined any sandbox.
		// Just return the endpoint
		return ep
	}
	// Prefer the sandbox's view of the endpoint, which carries join info.
	return sb.GetEndpoint(ep.ID())
}
// Iface returns information about the interface which was assigned to
// the endpoint by the driver. This can be used after the
// endpoint has been created.
func (ep *Endpoint) Iface() *EndpointInterface {
	ep.mu.Lock()
	iface := ep.iface
	ep.mu.Unlock()
	return iface
}
// SetMacAddress allows the driver to set the mac address to the endpoint interface
// during the call to CreateEndpoint, if the mac address is not already set.
func (epi *EndpointInterface) SetMacAddress(mac net.HardwareAddr) error {
	switch {
	case epi.mac != nil:
		// A MAC can only be set once.
		return types.ForbiddenErrorf("endpoint interface MAC address present (%s). Cannot be modified with %s.", epi.mac, mac)
	case mac == nil:
		return types.InvalidParameterErrorf("tried to set nil MAC address to endpoint interface")
	}
	epi.mac = types.GetMacCopy(mac)
	return nil
}
// SetIPAddress assigns the given address to the interface, storing it in the
// IPv4 or IPv6 slot depending on the address family.
func (epi *EndpointInterface) SetIPAddress(address *net.IPNet) error {
	if address.IP == nil {
		return types.InvalidParameterErrorf("tried to set nil IP address to endpoint interface")
	}
	target := &epi.addr
	if address.IP.To4() == nil {
		target = &epi.addrv6
	}
	return setAddress(target, address)
}
// setAddress stores a copy of address in *ifaceAddr, refusing to overwrite an
// address that is already set.
func setAddress(ifaceAddr **net.IPNet, address *net.IPNet) error {
	if existing := *ifaceAddr; existing != nil {
		return types.ForbiddenErrorf("endpoint interface IP present (%s). Cannot be modified with (%s).", existing, address)
	}
	*ifaceAddr = types.GetIPNetCopy(address)
	return nil
}
// MacAddress returns a copy of the MAC address assigned to the endpoint, or
// nil when none is set.
func (epi *EndpointInterface) MacAddress() net.HardwareAddr {
	return types.GetMacCopy(epi.mac)
}
// Address returns a copy of the IPv4 address assigned to the endpoint, or nil
// when none is set.
func (epi *EndpointInterface) Address() *net.IPNet {
	return types.GetIPNetCopy(epi.addr)
}
// AddressIPv6 returns a copy of the IPv6 address assigned to the endpoint, or
// nil when none is set.
func (epi *EndpointInterface) AddressIPv6() *net.IPNet {
	return types.GetIPNetCopy(epi.addrv6)
}
// LinkLocalAddresses returns the list of link-local (IPv4/IPv6) addresses assigned to the endpoint.
// Note: the underlying slice is returned directly, not a copy.
func (epi *EndpointInterface) LinkLocalAddresses() []*net.IPNet {
	return epi.llAddrs
}
// SrcName returns the name of the interface w/in the container.
func (epi *EndpointInterface) SrcName() string {
	return epi.srcName
}
// SetNames method assigns the srcName, dstName, and dstPrefix for the
// interface. If both dstName and dstPrefix are set, dstName takes precedence.
// It always returns nil.
func (epi *EndpointInterface) SetNames(srcName, dstPrefix, dstName string) error {
	epi.srcName = srcName
	epi.dstPrefix = dstPrefix
	epi.dstName = dstName
	return nil
}
// NetnsPath returns the path of the network namespace, if there is one. Else "".
func (epi *EndpointInterface) NetnsPath() string {
	return epi.netnsPath
}
// SetCreatedInContainer can be called by the driver to indicate that it's
// created the network interface in the container's network namespace (so,
// it doesn't need to be moved there).
func (epi *EndpointInterface) SetCreatedInContainer(cic bool) {
	epi.createdInContainer = cic
}
// InterfaceName exposes the endpoint's interface for naming by the driver.
func (ep *Endpoint) InterfaceName() driverapi.InterfaceNameInfo {
	ep.mu.Lock()
	iface := ep.iface
	ep.mu.Unlock()
	return iface
}
// AddStaticRoute adds a route to the sandbox.
// It may be used in addition to or instead of a default gateway (as above).
func (ep *Endpoint) AddStaticRoute(destination *net.IPNet, routeType int, nextHop net.IP) error {
	ep.mu.Lock()
	defer ep.mu.Unlock()
	if routeType != types.NEXTHOP {
		// If the route doesn't specify a next-hop, it must be a connected route, bound to an interface.
		ep.iface.routes = append(ep.iface.routes, destination)
		return nil
	}
	// If the route specifies a next-hop, then it's loosely routed (i.e. not bound to a particular interface).
	ep.joinInfo.StaticRoutes = append(ep.joinInfo.StaticRoutes, &types.StaticRoute{
		Destination: destination,
		RouteType:   routeType,
		NextHop:     nextHop,
	})
	return nil
}
// AddTableEntry adds a table entry to the gossip layer
// passing the table name, key and an opaque value.
func (ep *Endpoint) AddTableEntry(tableName, key string, value []byte) error {
	entry := &tableEntry{
		tableName: tableName,
		key:       key,
		value:     value,
	}
	ep.mu.Lock()
	ep.joinInfo.driverTableEntries = append(ep.joinInfo.driverTableEntries, entry)
	ep.mu.Unlock()
	return nil
}
// Sandbox returns the attached sandbox if there, nil otherwise.
func (ep *Endpoint) Sandbox() *Sandbox {
	if sb, ok := ep.getSandbox(); ok {
		return sb
	}
	return nil
}
// LoadBalancer returns whether the endpoint is the load balancer endpoint for the network.
func (ep *Endpoint) LoadBalancer() bool {
	ep.mu.Lock()
	lb := ep.loadBalancer
	ep.mu.Unlock()
	return lb
}
// StaticRoutes returns the list of static routes configured by the network
// driver when the container joins a network. It returns nil when no container
// has joined yet.
func (ep *Endpoint) StaticRoutes() []*types.StaticRoute {
	ep.mu.Lock()
	defer ep.mu.Unlock()
	ji := ep.joinInfo
	if ji == nil {
		return nil
	}
	return ji.StaticRoutes
}
// Gateway returns the IPv4 gateway assigned by the driver.
// This will only return a valid value if a container has joined the endpoint.
func (ep *Endpoint) Gateway() net.IP {
	ep.mu.Lock()
	defer ep.mu.Unlock()
	ji := ep.joinInfo
	if ji == nil {
		return net.IP{}
	}
	return types.GetIPCopy(ji.gw)
}
// GatewayIPv6 returns the IPv6 gateway assigned by the driver.
// This will only return a valid value if a container has joined the endpoint.
func (ep *Endpoint) GatewayIPv6() net.IP {
	ep.mu.Lock()
	defer ep.mu.Unlock()
	ji := ep.joinInfo
	if ji == nil {
		return net.IP{}
	}
	return types.GetIPCopy(ji.gw6)
}
// SetGateway sets the default IPv4 gateway when a container joins the endpoint.
// It always returns nil.
func (ep *Endpoint) SetGateway(gw net.IP) error {
	ep.mu.Lock()
	ep.joinInfo.gw = types.GetIPCopy(gw)
	ep.mu.Unlock()
	return nil
}
// SetGatewayIPv6 sets the default IPv6 gateway when a container joins the endpoint.
// It always returns nil.
func (ep *Endpoint) SetGatewayIPv6(gw6 net.IP) error {
	ep.mu.Lock()
	ep.joinInfo.gw6 = types.GetIPCopy(gw6)
	ep.mu.Unlock()
	return nil
}
// hasGatewayOrDefaultRoute returns true if ep has a gateway, or a route to '0.0.0.0'/'::'.
func (ep *Endpoint) hasGatewayOrDefaultRoute() (v4, v6 bool) {
	ep.mu.Lock()
	defer ep.mu.Unlock()
	if ep.joinInfo != nil {
		// A non-empty gateway IP counts for its address family.
		v4 = len(ep.joinInfo.gw) > 0
		v6 = len(ep.joinInfo.gw6) > 0
		if !v4 || !v6 {
			// Also treat an all-zero destination with an all-zero mask
			// ("0.0.0.0/0" or "::/0") as a default route.
			for _, route := range ep.joinInfo.StaticRoutes {
				if route.Destination.IP.IsUnspecified() && net.IP(route.Destination.Mask).IsUnspecified() {
					if route.Destination.IP.To4() == nil {
						v6 = true
					} else {
						v4 = true
					}
				}
			}
		}
	}
	// Connected (interface-bound) routes can also provide a default route.
	if ep.iface != nil && (!v4 || !v6) {
		for _, route := range ep.iface.routes {
			if route.IP.IsUnspecified() && net.IP(route.Mask).IsUnspecified() {
				if route.IP.To4() == nil {
					v6 = true
				} else {
					v4 = true
				}
			}
		}
	}
	return v4, v6
}
// retrieveFromStore reloads the latest persisted state of this endpoint from
// its network's store.
func (ep *Endpoint) retrieveFromStore() (*Endpoint, error) {
	network, err := ep.getNetworkFromStore()
	if err != nil {
		return nil, fmt.Errorf("could not find network in store to get latest endpoint %s: %v", ep.Name(), err)
	}
	return network.getEndpointFromStore(ep.ID())
}
// DisableGatewayService tells libnetwork not to provide Default GW for the container
func (ep *Endpoint) DisableGatewayService() {
	ep.mu.Lock()
	ep.joinInfo.disableGatewayService = true
	ep.mu.Unlock()
}
// MarshalJSON encodes the join info as a JSON object; gateway fields are only
// emitted when set.
func (epj *endpointJoinInfo) MarshalJSON() ([]byte, error) {
	out := map[string]interface{}{
		"disableGatewayService": epj.disableGatewayService,
		"StaticRoutes":          epj.StaticRoutes,
	}
	if epj.gw != nil {
		out["gw"] = epj.gw.String()
	}
	if epj.gw6 != nil {
		out["gw6"] = epj.gw6.String()
	}
	return json.Marshal(out)
}
// UnmarshalJSON decodes the representation produced by MarshalJSON back into
// the join info.
func (epj *endpointJoinInfo) UnmarshalJSON(b []byte) error {
	var (
		err   error
		epMap map[string]interface{}
	)
	if err = json.Unmarshal(b, &epMap); err != nil {
		return err
	}
	if v, ok := epMap["gw"]; ok {
		epj.gw = net.ParseIP(v.(string))
	}
	if v, ok := epMap["gw6"]; ok {
		epj.gw6 = net.ParseIP(v.(string))
	}
	epj.disableGatewayService = epMap["disableGatewayService"].(bool)
	var tStaticRoute []types.StaticRoute
	if v, ok := epMap["StaticRoutes"]; ok {
		// TODO(cpuguy83): Linter caught that we aren't checking errors here
		// I don't know why we aren't other than potentially the data is not always expected to be right?
		// This is why I'm not adding the error check.
		//
		// In any case for posterity please if you figure this out document it or check the error
		tb, _ := json.Marshal(v)              //nolint:errchkjson // FIXME: handle json (Un)Marshal errors (see above)
		_ = json.Unmarshal(tb, &tStaticRoute) //nolint:errcheck
	}
	var StaticRoutes []*types.StaticRoute
	// Take the address of each slice element rather than of the loop
	// variable: with Go versions before 1.22, `for _, r := range` reuses one
	// variable, so every `&r` aliased the same route (all entries ended up
	// pointing at the last element).
	for i := range tStaticRoute {
		StaticRoutes = append(StaticRoutes, &tStaticRoute[i])
	}
	epj.StaticRoutes = StaticRoutes
	return nil
}
// CopyTo copies the join info into dstEpj. Gateway IPs are cloned; the route
// and table-entry slices get fresh backing arrays, though their elements are
// shared pointers.
func (epj *endpointJoinInfo) CopyTo(dstEpj *endpointJoinInfo) error {
	dstEpj.gw = types.GetIPCopy(epj.gw)
	dstEpj.gw6 = types.GetIPCopy(epj.gw6)
	dstEpj.disableGatewayService = epj.disableGatewayService
	dstEpj.StaticRoutes = make([]*types.StaticRoute, len(epj.StaticRoutes))
	copy(dstEpj.StaticRoutes, epj.StaticRoutes)
	dstEpj.driverTableEntries = make([]*tableEntry, len(epj.driverTableEntries))
	copy(dstEpj.driverTableEntries, epj.driverTableEntries)
	return nil
}
//go:build !windows
package libnetwork
import "fmt"
// DriverInfo returns a collection of driver operational data related to this endpoint retrieved from the driver.
func (ep *Endpoint) DriverInfo() (map[string]interface{}, error) {
	// Refresh the endpoint from the store so the driver is queried with the
	// latest persisted state.
	ep, err := ep.retrieveFromStore()
	if err != nil {
		return nil, err
	}
	n, err := ep.getNetworkFromStore()
	if err != nil {
		return nil, fmt.Errorf("could not find network in store for driver info: %v", err)
	}
	// load=true: instantiate the driver if it isn't loaded yet.
	driver, err := n.driver(true)
	if err != nil {
		return nil, fmt.Errorf("failed to get driver info: %v", err)
	}
	return driver.EndpointOperInfo(n.ID(), ep.ID())
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package libnetwork
import (
"context"
"github.com/docker/docker/daemon/libnetwork/internal/maputil"
)
// storeEndpoint inserts or updates the endpoint in the store and the in-memory
// cache maintained by the Controller.
//
// This method is thread-safe.
func (c *Controller) storeEndpoint(ctx context.Context, ep *Endpoint) error {
	// Persist first; the cache is only updated when the write succeeded.
	if err := c.updateToStore(ctx, ep); err != nil {
		return err
	}
	c.cacheEndpoint(ep)
	return nil
}
// deleteStoredEndpoint deletes the endpoint from the store and the in-memory
// cache maintained by the Controller.
//
// This method is thread-safe.
func (c *Controller) deleteStoredEndpoint(ep *Endpoint) error {
	// Remove from the persistent store first; only evict from the cache when
	// that succeeded.
	if err := c.deleteFromStore(ep); err != nil {
		return err
	}
	c.endpointsMu.Lock()
	delete(c.endpoints, ep.id)
	c.endpointsMu.Unlock()
	return nil
}
// cacheEndpoint caches the endpoint in the in-memory cache of endpoints
// maintained by the Controller.
//
// This method is thread-safe.
func (c *Controller) cacheEndpoint(ep *Endpoint) {
	c.endpointsMu.Lock()
	c.endpoints[ep.id] = ep
	c.endpointsMu.Unlock()
}
// findEndpoints looks for all endpoints matching the filter from the in-memory
// cache of endpoints maintained by the Controller.
//
// This method is thread-safe, but do not use it unless you're sure your code
// uses the returned endpoints in thread-safe way (see the comment on
// Controller.endpoints).
func (c *Controller) findEndpoints(filter func(ep *Endpoint) bool) []*Endpoint {
	c.endpointsMu.Lock()
	defer c.endpointsMu.Unlock()
	return maputil.FilterValues(c.endpoints, filter)
}
// filterEndpointByNetworkId returns a predicate matching endpoints that
// belong to the network with the given ID.
func filterEndpointByNetworkId(networkID string) func(ep *Endpoint) bool {
	return func(ep *Endpoint) bool {
		n := ep.network
		return n != nil && n.id == networkID
	}
}
package libnetwork
import (
"fmt"
"strings"
)
// ErrNoSuchNetwork is returned when a network query finds no result.
type ErrNoSuchNetwork string

// Error implements the error interface.
func (nsn ErrNoSuchNetwork) Error() string {
	return "network " + string(nsn) + " not found"
}

// NotFound denotes the type of this error
func (nsn ErrNoSuchNetwork) NotFound() {}
// NetworkNameError is returned when a network with the same name already exists.
type NetworkNameError string

// Error implements the error interface.
func (nnr NetworkNameError) Error() string {
	return "network with name " + string(nnr) + " already exists"
}

// Conflict denotes the type of this error
func (nnr NetworkNameError) Conflict() {}
// ActiveEndpointsError is returned when a network is deleted which has active
// endpoints in it.
type ActiveEndpointsError struct {
	name      string
	id        string
	endpoints []string
}

// Error implements the error interface, listing the active endpoints.
func (aee *ActiveEndpointsError) Error() string {
	return "network " + aee.name + " has active endpoints (" + strings.Join(aee.endpoints, ", ") + ")"
}

// Forbidden denotes the type of this error
func (aee *ActiveEndpointsError) Forbidden() {}
// ActiveContainerError is returned when an endpoint is deleted which has active
// containers attached to it.
type ActiveContainerError struct {
	name string
	id   string
}

// Error implements the error interface.
func (ace *ActiveContainerError) Error() string {
	return "endpoint with name " + ace.name + " id " + ace.id + " has active containers"
}

// Forbidden denotes the type of this error
func (ace *ActiveContainerError) Forbidden() {}
// ManagerRedirectError is returned when the request should be redirected to Manager
type ManagerRedirectError string

// Error implements the error interface; the message is fixed regardless of
// the string value.
func (mr ManagerRedirectError) Error() string {
	return "Redirect the request to the manager"
}

// Maskable denotes the type of this error
func (mr ManagerRedirectError) Maskable() {}
package etchosts
import (
"bufio"
"bytes"
"fmt"
"io"
"net/netip"
"os"
"regexp"
"strings"
"sync"
)
// Record is a single hosts-file entry: an IP address plus one or more
// space-separated hostnames.
type Record struct {
	Hosts string
	IP    netip.Addr
}

// WriteTo writes the record to w as "IP<TAB>hosts<NL>" and returns the number
// of bytes written together with any write error.
func (r Record) WriteTo(w io.Writer) (int64, error) {
	written, err := fmt.Fprintf(w, "%s\t%s\n", r.IP, r.Hosts)
	return int64(written), err
}
var (
	// defaultContentIPv4 holds the default IPv4 hosts records.
	defaultContentIPv4 = []Record{
		{Hosts: "localhost", IP: netip.MustParseAddr("127.0.0.1")},
	}
	// defaultContentIPv6 holds the default IPv6 hosts records (loopback and
	// well-known multicast/anycast names).
	defaultContentIPv6 = []Record{
		{Hosts: "localhost ip6-localhost ip6-loopback", IP: netip.IPv6Loopback()},
		{Hosts: "ip6-localnet", IP: netip.MustParseAddr("fe00::")},
		{Hosts: "ip6-mcastprefix", IP: netip.MustParseAddr("ff00::")},
		{Hosts: "ip6-allnodes", IP: netip.MustParseAddr("ff02::1")},
		{Hosts: "ip6-allrouters", IP: netip.MustParseAddr("ff02::2")},
	}

	// A cache of path level locks for synchronizing /etc/hosts
	// updates on a file level
	pathMap = make(map[string]*sync.Mutex)

	// A package level mutex to synchronize the cache itself
	pathMutex sync.Mutex
)
// pathLock acquires the per-path mutex for path (creating it on first use)
// and returns the function that releases it.
func pathLock(path string) func() {
	pathMutex.Lock()
	defer pathMutex.Unlock()
	lock, ok := pathMap[path]
	if !ok {
		lock = &sync.Mutex{}
		pathMap[path] = lock
	}
	lock.Lock()
	return lock.Unlock
}
// Drop drops the path string from the path cache
func Drop(path string) {
	pathMutex.Lock()
	delete(pathMap, path)
	pathMutex.Unlock()
}
// Build writes a hosts file at path containing the default IPv4 and IPv6
// records followed by extraContent.
//
// path is path to host file string required.
// extraContent is an array of extra host records.
func Build(path string, extraContent []Record) error {
	return build(path, defaultContentIPv4, defaultContentIPv6, extraContent)
}
// BuildNoIPv6 is the same as Build, but will not include IPv6 entries.
func BuildNoIPv6(path string, extraContent []Record) error {
	// Filter out any IPv6 records from the caller-supplied content.
	var v4Records []Record
	for _, rec := range extraContent {
		if rec.IP.Is6() {
			continue
		}
		v4Records = append(v4Records, rec)
	}
	return build(path, defaultContentIPv4, v4Records)
}
// build renders every record group in contents into a buffer and atomically
// replaces the file at path with the result, holding the per-path lock.
func build(path string, contents ...[]Record) error {
	unlock := pathLock(path)
	defer unlock()

	var buf bytes.Buffer
	for _, records := range contents {
		for _, rec := range records {
			if _, err := rec.WriteTo(&buf); err != nil {
				return err
			}
		}
	}
	return os.WriteFile(path, buf.Bytes(), 0o644)
}
// Add adds an arbitrary number of Records to an already existing /etc/hosts file.
// It is a no-op when recs is empty. Updates to the same path are serialized
// via a per-path lock.
func Add(path string, recs []Record) error {
	if len(recs) == 0 {
		return nil
	}
	defer pathLock(path)()
	content := bytes.NewBuffer(nil)
	for _, r := range recs {
		if _, err := r.WriteTo(content); err != nil {
			return err
		}
	}
	f, err := os.OpenFile(path, os.O_APPEND|os.O_WRONLY, 0o644)
	if err != nil {
		return err
	}
	_, werr := f.Write(content.Bytes())
	cerr := f.Close()
	if werr != nil {
		// A write error takes precedence over any close error.
		return werr
	}
	// Report close errors instead of discarding them (as before): a failed
	// close can mean the appended data was not persisted.
	return cerr
}
// Delete deletes Records from /etc/hosts.
// The hostnames must be an exact match (if the user has modified the record,
// it won't be deleted). The address, parsed as a netip.Addr must also match
// the value in recs.
func Delete(path string, recs []Record) error {
	if len(recs) == 0 {
		return nil
	}
	defer pathLock(path)()
	f, err := os.OpenFile(path, os.O_RDWR, 0o644)
	if err != nil {
		return err
	}
	defer f.Close()
	var buf bytes.Buffer
	s := bufio.NewScanner(f)
	eol := []byte{'\n'}
loop:
	for s.Scan() {
		b := s.Bytes()
		// Drop empty lines; keep comment lines verbatim.
		if len(b) == 0 {
			continue
		}
		if b[0] == '#' {
			buf.Write(b)
			buf.Write(eol)
			continue
		}
		// Skip (i.e. delete) the line when both the hostname suffix and the
		// parsed address match one of the records to remove.
		for _, r := range recs {
			if before, found := strings.CutSuffix(string(b), "\t"+r.Hosts); found {
				if addr, err := netip.ParseAddr(strings.TrimSpace(before)); err == nil && addr == r.IP {
					continue loop
				}
			}
		}
		buf.Write(b)
		buf.Write(eol)
	}
	if err := s.Err(); err != nil {
		return err
	}
	// Rewrite the (already open) file in place: truncate, then write the
	// filtered content back at offset 0.
	if err := f.Truncate(0); err != nil {
		return err
	}
	_, err = f.WriteAt(buf.Bytes(), 0)
	return err
}
// Update all IP addresses where hostname matches.
// path is path to host file
// IP is new IP address
// hostname is hostname to search for to replace IP
func Update(path, IP, hostname string) error {
	// Match "<ip>\t<hostname>" followed by whitespace or a dot, replacing
	// only the leading IP field ($2/$3 preserve the hostname and delimiter).
	// NOTE(review): a record on the file's last line with no trailing
	// newline/whitespace would not match this pattern — verify against how
	// records are written (WriteTo always appends '\n').
	re, err := regexp.Compile(fmt.Sprintf(`(\S*)(\t%s)(\s|\.)`, regexp.QuoteMeta(hostname)))
	if err != nil {
		return err
	}
	defer pathLock(path)()
	old, err := os.ReadFile(path)
	if err != nil {
		return err
	}
	return os.WriteFile(path, re.ReplaceAll(old, []byte(IP+"$2"+"$3")), 0o644)
}
package libnetwork
import (
"context"
"errors"
"fmt"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/internal/nftables"
"github.com/docker/docker/daemon/libnetwork/iptables"
)
// userChain is the name of the iptables chain reserved for user-defined rules.
const userChain = "DOCKER-USER"
// selectFirewallBackend decides which firewall backend the controller uses,
// based on the configured FirewallBackend value.
func (c *Controller) selectFirewallBackend() error {
	switch c.cfg.FirewallBackend {
	case "iptables":
		// Explicitly configured to use iptables; don't consider nftables.
		return nil
	case "nftables":
		// Explicitly configured to use nftables; fail if it can't be initialised.
		if err := nftables.Enable(); err != nil {
			return fmt.Errorf("firewall-backend is set to nftables: %v", err)
		}
		return nil
	}
	return nil
}
// setupUserChains sets up the DOCKER-USER chain for each iptables version
// (IPv4, IPv6) that's enabled in the controller's configuration, and arranges
// for the setup to be re-run when the firewall is reloaded.
func (c *Controller) setupUserChains() {
	// There's no equivalent to DOCKER-USER in the nftables implementation.
	if nftables.Enabled() {
		return
	}
	ensure := func() error {
		var errs []error
		for _, ipVersion := range c.enabledIptablesVersions() {
			errs = append(errs, setupUserChain(ipVersion))
		}
		return errors.Join(errs...)
	}
	if err := ensure(); err != nil {
		log.G(context.Background()).WithError(err).Warn("configuring " + userChain)
	}
	iptables.OnReloaded(func() {
		if err := ensure(); err != nil {
			log.G(context.Background()).WithError(err).Warn("configuring " + userChain + " on firewall reload")
		}
	})
}
// setupUserChain sets up the DOCKER-USER chain for the given [iptables.IPVersion].
//
// This chain allows users to configure firewall policies in a way that
// persist daemon operations/restarts. The daemon does not delete or modify
// any pre-existing rules from the DOCKER-USER filter chain.
//
// Once the DOCKER-USER chain is created, the daemon does not remove it when
// IPTableForwarding is disabled, because it contains rules configured by user
// that are beyond the daemon's control.
func setupUserChain(ipVersion iptables.IPVersion) error {
	ipt := iptables.GetIptable(ipVersion)
	_, err := ipt.NewChain(userChain, iptables.Filter)
	if err != nil {
		return fmt.Errorf("failed to create %s %v chain: %v", userChain, ipVersion, err)
	}
	err = ipt.EnsureJumpRule("FORWARD", userChain)
	if err != nil {
		return fmt.Errorf("failed to ensure the jump rule for %s %v: %w", userChain, ipVersion, err)
	}
	return nil
}
// Package addrset implements a set of IP addresses.
package addrset
import (
"errors"
"fmt"
"net"
"net/netip"
"strings"
"github.com/docker/docker/daemon/libnetwork/bitmap"
"github.com/docker/docker/daemon/libnetwork/internal/netiputil"
"github.com/docker/docker/daemon/libnetwork/ipbits"
)
var (
	// ErrNotAvailable is returned when no more addresses are available to set.
	ErrNotAvailable = errors.New("address not available")
	// ErrAllocated is returned when the specific address requested is already allocated.
	ErrAllocated = errors.New("address already allocated")
)
const (
	// maxBitsPerBitmap is the max size for a single bitmap in the address set.
	//
	// [bitmap.Bitmap] is initialised with a uint64 num-bits. So, it can't contain
	// enough bits for a 64-bit range (it's one bit short, the last address in the
	// range can't be represented). If that's fixed, this max can be increased, but
	// addrsPerBitmap() will need updating to deal with the overflow.
	//
	// A max of 63-bits means a 64-bit address range (the norm for IPv6) is
	// represented by up-to two bitmaps.
	maxBitsPerBitmap = 63
	// minPrefixLen is the prefix length corresponding to maxBitsPerBitmap
	// (128 - 63 = 65 for IPv6-sized addresses).
	minPrefixLen = (net.IPv6len * 8) - maxBitsPerBitmap
)
// AddrSet is a set of IP addresses.
type AddrSet struct {
	pool netip.Prefix // the full address range this set covers (masked)
	// bitmaps maps the base prefix of each sub-range to the bitmap tracking
	// allocations within it; sub-ranges are created lazily.
	bitmaps map[netip.Prefix]*bitmap.Bitmap
}
// New returns an AddrSet for the range of addresses in pool.
func New(pool netip.Prefix) *AddrSet {
	as := &AddrSet{pool: pool.Masked()}
	as.bitmaps = make(map[netip.Prefix]*bitmap.Bitmap)
	return as
}
// Add adds address addr to the set. If addr is already in the set, it returns a
// wrapped [ErrAllocated]. If addr is not in the set's address range, it returns
// an error.
func (as *AddrSet) Add(addr netip.Addr) error {
	if !as.pool.Contains(addr) {
		return fmt.Errorf("cannot add %s to '%s'", addr, as.pool)
	}
	bmp, _, err := as.getBitmap(addr)
	if err != nil {
		return fmt.Errorf("finding bitmap for %s in '%s': %w", addr, as.pool, err)
	}
	// The bit to set is the host part of the address within its sub-range.
	ordinal := netiputil.HostID(addr, as.prefixLenPerBitmap())
	if err := bmp.Set(ordinal); err != nil {
		return fmt.Errorf("setting bit %d for %s in pool '%s': %w", ordinal, addr, as.pool, mapErr(err))
	}
	return nil
}
// AddAny adds an arbitrary address to the set, and returns that address. Or, if
// no addresses are available, it returns a wrapped [ErrNotAvailable].
//
// If the address set's pool contains fewer than 1<<maxBitsPerBitmap addresses,
// AddAny will add any address from the entire set. If the pool is bigger than
// that, AddAny will only consider the first 1<<maxBitsPerBitmap addresses. If
// those are all allocated, it returns [ErrNotAvailable].
//
// When serial=true, the set is scanned starting from the address following
// the address most recently set by [AddrSet.AddAny] (or [AddrSet.AddAnyInRange]
// if the range is in the same 1<<maxBitsPerBitmap .
func (as *AddrSet) AddAny(serial bool) (netip.Addr, error) {
	// Only look at the first bitmap. It either contains the whole address range or
	// the first 1<<maxBitsPerBitmap addresses, which is a lot. (So, no need to
	// search other bitmaps, or work out if more bitmaps could be created).
	bm, _, err := as.getBitmap(as.pool.Addr())
	if err != nil {
		return netip.Addr{}, fmt.Errorf("no bitmap to add-any to '%s': %w", as.pool.Addr(), err)
	}
	ordinal, err := bm.SetAny(serial)
	if err != nil {
		return netip.Addr{}, fmt.Errorf("add-any to '%s': %w", as.pool.Addr(), mapErr(err))
	}
	// Convert the allocated bit ordinal back into an address in the pool.
	return ipbits.Add(as.pool.Addr(), ordinal, 0), nil
}
// AddAnyInRange adds an arbitrary address from ipr to the set, and returns that
// address. Or, if no addresses are available, it returns a wrapped [ErrNotAvailable].
// If ipr is not fully contained within the set's range, it returns an error.
//
// When serial=true, the set is scanned starting from the address following
// the address most recently set by [AddrSet.AddAny] or [AddrSet.AddAnyInRange].
func (as *AddrSet) AddAnyInRange(ipr netip.Prefix, serial bool) (netip.Addr, error) {
	if ipr.Bits() < as.pool.Bits() || !as.pool.Contains(ipr.Addr()) {
		return netip.Addr{}, fmt.Errorf("add-any, range '%s' is not in subnet '%s'", ipr, as.pool)
	}
	masked := ipr.Masked()
	bmp, key, err := as.getBitmap(masked.Addr())
	if err != nil {
		return netip.Addr{}, fmt.Errorf("no bitmap to add-any in '%s' range '%s': %w", as.pool, ipr, err)
	}
	var ordinal uint64
	if ipr.Bits() <= key.Bits() {
		// The requested range covers the whole bitmap; any free bit will do.
		ordinal, err = bmp.SetAny(serial)
	} else {
		// The range is a sub-span of the bitmap; restrict the search to it.
		first, last := netiputil.SubnetRange(key, masked)
		ordinal, err = bmp.SetAnyInRange(first, last, serial)
	}
	if err != nil {
		return netip.Addr{}, fmt.Errorf("add-any in '%s' range '%s': %w", as.pool, ipr, mapErr(err))
	}
	return ipbits.Add(key.Addr(), ordinal, 0), nil
}
// Remove removes addr from the set or, if addr is not in the set's address range it
// returns an error. If addr is not in the set, it returns nil (removing an address
// that's not in the set is not an error).
func (as *AddrSet) Remove(addr netip.Addr) error {
	if !as.pool.Contains(addr) {
		return fmt.Errorf("%s cannot be removed from '%s'", addr, as.pool)
	}
	bmp, key, err := as.getBitmap(addr)
	if err != nil {
		return fmt.Errorf("remove '%s' from '%s': %w", addr, as.pool, err)
	}
	ordinal := netiputil.HostID(addr, as.prefixLenPerBitmap())
	if unsetErr := bmp.Unset(ordinal); unsetErr != nil {
		return fmt.Errorf("unset bit %d for '%s' in '%s': %w", ordinal, addr, as.pool, unsetErr)
	}
	// Drop the bitmap once it has no allocated bits left, to bound memory use.
	if bmp.Bits() == bmp.Unselected() {
		delete(as.bitmaps, key)
	}
	return nil
}
// String returns a description of the address set.
func (as *AddrSet) String() string {
	if len(as.bitmaps) == 0 {
		return "empty address set"
	}
	// A pool small enough for a single bitmap is keyed by the pool itself.
	if as.pool.Addr().BitLen()-as.pool.Bits() <= maxBitsPerBitmap {
		return as.bitmaps[as.pool].String()
	}
	// Note: map iteration order is unspecified, so ranges may appear in any order.
	descs := make([]string, 0, len(as.bitmaps))
	for key, bmp := range as.bitmaps {
		descs = append(descs, fmt.Sprintf("range %s %s", key, bmp))
	}
	return strings.Join(descs, " ")
}
// getBitmap returns the bitmap (and its sub-prefix key) covering addr,
// creating and registering a new bitmap if none exists yet.
func (as *AddrSet) getBitmap(addr netip.Addr) (*bitmap.Bitmap, netip.Prefix, error) {
	// Each bitmap covers at most maxBitsPerBitmap host bits.
	bits := min(as.pool.Addr().BitLen()-as.pool.Bits(), maxBitsPerBitmap)
	key, err := addr.Prefix(as.pool.Addr().BitLen() - bits)
	if err != nil {
		return nil, netip.Prefix{}, err
	}
	if bmp, ok := as.bitmaps[key]; ok {
		return bmp, key, nil
	}
	bmp := bitmap.New(as.addrsPerBitmap())
	as.bitmaps[key] = bmp
	return bmp, key, nil
}
// addrsPerBitmap returns the number of addresses each bitmap in this set
// covers: the full pool size, capped at 1<<maxBitsPerBitmap.
func (as *AddrSet) addrsPerBitmap() uint64 {
	return uint64(1) << min(as.pool.Addr().BitLen()-as.pool.Bits(), maxBitsPerBitmap)
}
// prefixLenPerBitmap returns the prefix length of each bitmap's sub-prefix:
// the pool's own length, raised to minPrefixLen for big IPv6 pools so a
// single bitmap never has to cover more than maxBitsPerBitmap host bits.
func (as *AddrSet) prefixLenPerBitmap() uint {
	if pl := as.pool.Bits(); !as.pool.Addr().Is6() || pl >= minPrefixLen {
		return uint(pl)
	}
	return minPrefixLen
}
// mapErr translates bitmap package sentinel errors into this package's
// sentinels, leaving any other error untouched.
func mapErr(err error) error {
	switch {
	case errors.Is(err, bitmap.ErrBitAllocated):
		return ErrAllocated
	case errors.Is(err, bitmap.ErrNoBitAvailable):
		return ErrNotAvailable
	default:
		return err
	}
}
package caller
import (
"runtime"
"strings"
)
// callerInfo returns the bare name of the function i stack frames above
// runtime.Caller, or "unknown" if the frame cannot be resolved.
func callerInfo(i int) string {
	pc, _, _, ok := runtime.Caller(i)
	if !ok {
		return "unknown"
	}
	fn := runtime.FuncForPC(pc)
	if fn == nil {
		return "unknown"
	}
	// fn.Name() is like: github.com/docker/libnetwork/caller.MethodName;
	// keep only the part after the last dot.
	full := fn.Name()
	if idx := strings.LastIndex(full, "."); idx >= 0 {
		return full[idx+1:]
	}
	return full
}
// Name returns the name of the function at the specified level.
// (level == 0 means current method name).
func Name(level int) string {
	// +2 skips the callerInfo and Name frames themselves.
	const skipFrames = 2
	return callerInfo(skipFrames + level)
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package countmap
// Map is a map of counters.
type Map[T comparable] map[T]int

// Add adds delta to the counter for v and returns the new value.
//
// If the new value is 0, the entry is removed from the map.
func (m Map[T]) Add(v T, delta int) int {
	c := m[v] + delta
	if c == 0 {
		// Keep the map free of zeroed counters.
		delete(m, v)
	} else {
		m[v] = c
	}
	return c
}
package boltdb
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"os"
"path/filepath"
"sync"
"sync/atomic"
"time"
store "github.com/docker/docker/daemon/libnetwork/internal/kvstore"
bolt "go.etcd.io/bbolt"
berrors "go.etcd.io/bbolt/errors"
)
// filePerm is the file mode used when creating the BoltDB database file.
const filePerm = 0o644

// BoltDB type implements the Store interface
type BoltDB struct {
// mu serializes all store operations on this handle.
mu sync.Mutex
// client is the underlying bbolt database handle.
client *bolt.DB
// boltBucket is the name of the single bucket all keys live in.
boltBucket []byte
// dbIndex is a monotonically increasing index, prepended to every stored value.
dbIndex atomic.Uint64
// path is the filesystem path of the database file.
path string
}

// libkvmetadatalen is the number of metadata bytes (a little-endian uint64
// index) prepended to each value written to the store.
const libkvmetadatalen = 8
// New opens a new BoltDB connection to the specified path and bucket
func New(path, bucket string) (store.Store, error) {
	dir, _ := filepath.Split(path)
	if err := os.MkdirAll(dir, 0o750); err != nil {
		return nil, err
	}
	opts := &bolt.Options{
		// The bbolt package opens the underlying db file and then issues an
		// exclusive flock to ensures that it can safely write to the db. If
		// it fails, it'll re-issue flocks every few ms until Timeout is
		// reached.
		// This nanosecond timeout bypasses that retry loop and make sure the
		// bbolt package returns an ErrTimeout straight away. That way, the
		// daemon, and unit tests, will fail fast and loudly instead of
		// silently introducing delays.
		Timeout: time.Nanosecond,
	}
	db, err := bolt.Open(path, filePerm, opts)
	if err != nil {
		if errors.Is(err, berrors.ErrTimeout) {
			return nil, fmt.Errorf("boltdb file %s is already open", path)
		}
		return nil, err
	}
	return &BoltDB{
		client:     db,
		path:       path,
		boltBucket: []byte(bucket),
	}, nil
}
// Put the key, value pair. index number metadata is prepended to the value
func (b *BoltDB) Put(key string, value []byte) error {
	b.mu.Lock()
	defer b.mu.Unlock()
	return b.client.Update(func(tx *bolt.Tx) error {
		bucket, err := tx.CreateBucketIfNotExists(b.boltBucket)
		if err != nil {
			return err
		}
		// Stored record: 8-byte little-endian index, then the raw value.
		record := make([]byte, 0, libkvmetadatalen+len(value))
		record = binary.LittleEndian.AppendUint64(record, b.dbIndex.Add(1))
		record = append(record, value...)
		return bucket.Put([]byte(key), record)
	})
}
// Exists checks if the key exists inside the store
func (b *BoltDB) Exists(key string) (bool, error) {
	b.mu.Lock()
	defer b.mu.Unlock()
	found := false
	viewErr := b.client.View(func(tx *bolt.Tx) error {
		bucket := tx.Bucket(b.boltBucket)
		if bucket == nil {
			return store.ErrKeyNotFound
		}
		found = len(bucket.Get([]byte(key))) > 0
		return nil
	})
	switch {
	case viewErr != nil:
		return false, viewErr
	case !found:
		// Missing keys are reported as an error, matching the bucket-missing case.
		return false, store.ErrKeyNotFound
	}
	return true, nil
}
// List returns the range of keys starting with the passed in prefix
func (b *BoltDB) List(keyPrefix string) ([]*store.KVPair, error) {
	b.mu.Lock()
	defer b.mu.Unlock()
	var pairs []*store.KVPair
	err := b.client.View(func(tx *bolt.Tx) error {
		bucket := tx.Bucket(b.boltBucket)
		if bucket == nil {
			return store.ErrKeyNotFound
		}
		prefix := []byte(keyPrefix)
		c := bucket.Cursor()
		// Seek to the first key >= prefix and walk forward while keys match.
		for k, v := c.Seek(prefix); bytes.HasPrefix(k, prefix); k, v = c.Next() {
			// Values are only valid for the life of the transaction; clone the
			// payload (everything after the 8-byte index) before returning it.
			pairs = append(pairs, &store.KVPair{
				Key:       string(k),
				Value:     bytes.Clone(v[libkvmetadatalen:]),
				LastIndex: binary.LittleEndian.Uint64(v[:libkvmetadatalen]),
			})
		}
		return nil
	})
	if err != nil {
		return nil, err
	}
	if len(pairs) == 0 {
		return nil, store.ErrKeyNotFound
	}
	return pairs, nil
}
// AtomicDelete deletes a value at "key" if the key
// has not been modified in the meantime, throws an
// error if this is the case
func (b *BoltDB) AtomicDelete(key string, previous *store.KVPair) error {
	if previous == nil {
		return store.ErrPreviousNotSpecified
	}
	b.mu.Lock()
	defer b.mu.Unlock()
	k := []byte(key)
	return b.client.Update(func(tx *bolt.Tx) error {
		bucket := tx.Bucket(b.boltBucket)
		if bucket == nil {
			return store.ErrKeyNotFound
		}
		val := bucket.Get(k)
		if val == nil {
			return store.ErrKeyNotFound
		}
		// Compare the stored index against the caller's snapshot.
		if storedIndex := binary.LittleEndian.Uint64(val[:libkvmetadatalen]); storedIndex != previous.LastIndex {
			return store.ErrKeyModified
		}
		return bucket.Delete(k)
	})
}
// Delete deletes a value at "key". Unlike AtomicDelete it doesn't check
// whether the deleted key is at a specific version before deleting.
func (b *BoltDB) Delete(key string) error {
	b.mu.Lock()
	defer b.mu.Unlock()
	k := []byte(key)
	return b.client.Update(func(tx *bolt.Tx) error {
		bucket := tx.Bucket(b.boltBucket)
		if bucket == nil {
			return store.ErrKeyNotFound
		}
		if bucket.Get(k) == nil {
			return store.ErrKeyNotFound
		}
		return bucket.Delete(k)
	})
}
// AtomicPut puts a value at "key" if the key has not been
// modified since the last Put, throws an error if this is the case
func (b *BoltDB) AtomicPut(key string, value []byte, previous *store.KVPair) (*store.KVPair, error) {
b.mu.Lock()
defer b.mu.Unlock()
var dbIndex uint64
err := b.client.Update(func(tx *bolt.Tx) error {
bucket := tx.Bucket(b.boltBucket)
if bucket == nil {
// No bucket: the key cannot exist, so a non-nil previous is a conflict.
if previous != nil {
return store.ErrKeyNotFound
}
var err error
bucket, err = tx.CreateBucket(b.boltBucket)
if err != nil {
return err
}
}
// AtomicPut is equivalent to Put if previous is nil and the key
// doesn't exist in the DB.
val := bucket.Get([]byte(key))
if previous == nil && len(val) != 0 {
return store.ErrKeyExists
}
if previous != nil {
if len(val) == 0 {
return store.ErrKeyNotFound
}
// The stored index must match the caller's snapshot for the swap to proceed.
dbIndex = binary.LittleEndian.Uint64(val[:libkvmetadatalen])
if dbIndex != previous.LastIndex {
return store.ErrKeyModified
}
}
// Write the new value with a fresh index prepended.
dbIndex = b.dbIndex.Add(1)
dbval := make([]byte, 0, libkvmetadatalen+len(value))
dbval = binary.LittleEndian.AppendUint64(dbval, dbIndex)
dbval = append(dbval, value...)
return bucket.Put([]byte(key), dbval)
})
if err != nil {
return nil, err
}
// Return the caller's value with the index it was stored under.
return &store.KVPair{Key: key, Value: value, LastIndex: dbIndex}, nil
}
// Close the db connection to the BoltDB
func (b *BoltDB) Close() {
	b.mu.Lock()
	defer b.mu.Unlock()
	// The Store interface offers no way to report a close error, so it is
	// deliberately discarded.
	_ = b.client.Close()
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package l2disco
import (
"context"
"encoding/binary"
"fmt"
"net"
"slices"
"golang.org/x/sys/unix"
)
var (
// arpTemplate is an Ethernet/IPv4 ARP request body; the sender MAC
// (offset 8), sender IP (offset 14) and target IP (offset 24) are filled
// in per-instance, the target MAC stays zero.
arpTemplate = []byte{
0x00, 0x01, // Hardware type
0x08, 0x00, // Protocol
0x06, // Hardware address length
0x04, // IPv4 address length
0x00, 0x01, // ARP request
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Sender MAC
0x00, 0x00, 0x00, 0x00, // Sender IP
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Target MAC (always zeros)
0x00, 0x00, 0x00, 0x00, // Target IP
}
// bcastMAC is the Ethernet broadcast address the ARP frame is sent to.
bcastMAC = []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
)
// UnsolARP holds a prepared unsolicited ARP packet together with the raw
// packet socket and link-layer destination used to send it.
type UnsolARP struct {
// pkt is the fully populated ARP request payload.
pkt []byte
// sd is the AF_PACKET socket descriptor; -1 after Close.
sd int
// sa addresses frames to the broadcast MAC on the target interface.
sa *unix.SockaddrLinklayer
}
// NewUnsolARP returns a pointer to an object that can send unsolicited ARPs on
// the interface with ifIndex, for ip and mac.
func NewUnsolARP(_ context.Context, ip net.IP, mac net.HardwareAddr, ifIndex int) (*UnsolARP, error) {
	// ARP carries IPv4 addresses only. Normalise ip to its 4-byte form; a
	// 16-byte IPv4-in-IPv6 net.IP would otherwise have its leading zero
	// bytes copied into the packet instead of the address.
	ip4 := ip.To4()
	if ip4 == nil {
		return nil, fmt.Errorf("cannot send unsolicited ARP for non-IPv4 address %s", ip)
	}
	sd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_DGRAM|unix.SOCK_CLOEXEC, 0)
	if err != nil {
		return nil, fmt.Errorf("create socket: %w", err)
	}
	// Fill in the sender MAC/IP and target IP; the target MAC stays zero.
	pkt := slices.Clone(arpTemplate)
	copy(pkt[8:14], mac)
	copy(pkt[14:18], ip4)
	copy(pkt[24:28], ip4)
	// Address the frame to the Ethernet broadcast MAC on ifIndex.
	sa := &unix.SockaddrLinklayer{
		Protocol: htons(unix.ETH_P_ARP),
		Ifindex:  ifIndex,
		Hatype:   unix.ARPHRD_ETHER,
		Halen:    uint8(len(bcastMAC)),
	}
	copy(sa.Addr[:], bcastMAC)
	return &UnsolARP{
		pkt: pkt,
		sd:  sd,
		sa:  sa,
	}, nil
}
// Send sends an unsolicited ARP message.
func (ua *UnsolARP) Send() error {
// Broadcast the prepared ARP packet on the configured interface.
return unix.Sendto(ua.sd, ua.pkt, 0, ua.sa)
}
// Close releases resources.
func (ua *UnsolARP) Close() error {
	if ua.sd < 0 {
		// Already closed; closing twice is a no-op.
		return nil
	}
	sd := ua.sd
	ua.sd = -1
	return unix.Close(sd)
}
// From https://github.com/mdlayher/packet/blob/f9999b41d9cfb0586e75467db1c81cfde4f965ba/packet_linux.go#L238-L248
//
// htons converts i to network (big-endian) byte order as seen by the host:
// a byte swap on little-endian machines, the identity on big-endian ones.
func htons(i uint16) uint16 {
	buf := binary.BigEndian.AppendUint16(nil, i)
	return binary.NativeEndian.Uint16(buf)
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package l2disco
import (
"context"
"fmt"
"net"
"slices"
"github.com/containerd/log"
"golang.org/x/net/ipv6"
)
// naTemplate is an ICMPv6 Neighbour Advertisement body; the target IP
// (offset 8) and target link-layer address option (offset 26) are filled in
// per-instance.
var naTemplate = []byte{
0x88, // Type (136=NA)
0x00, // Code (always 0)
0x00, 0x00, // Checksum (filled in by the kernel)
0x20, // Flags, Router=0, Solicited=0, Override=1
0x00, 0x00, 0x00, // Reserved
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Target IP
0x02, // Option - target link layer address
0x01, // Option length, in units of 8 octets (1 = 8 bytes)
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Target MAC
}
// UnsolNA holds a prepared unsolicited Neighbour Advertisement packet and
// the ICMPv6 connection/control-message used to send it.
type UnsolNA struct {
// pkt is the fully populated NA message body.
pkt []byte
// pc is the ICMPv6 packet connection; nil after Close.
pc *ipv6.PacketConn
// cm carries the per-send source address, interface, and hop limit.
cm *ipv6.ControlMessage
}
// NewUnsolNA returns a pointer to an object that can send unsolicited Neighbour
// Advertisements for ip and mac.
// https://datatracker.ietf.org/doc/html/rfc4861#section-4.4
//
// NOTE(review): the copies below assume ip is a 16-byte IPv6 address; a
// 4-byte value would only partially fill the target field — confirm callers
// always pass IPv6 addresses.
func NewUnsolNA(ctx context.Context, ip net.IP, mac net.HardwareAddr, ifIndex int) (*UnsolNA, error) {
// Open a socket ... it'll be bound to an address but the address doesn't matter,
// no packets are to be received, and a source address is supplied when sending.
netPC, err := net.ListenPacket("ip6:ipv6-icmp", "::1")
if err != nil {
return nil, err
}
pc := ipv6.NewPacketConn(netPC)
// Block incoming packets.
f := ipv6.ICMPFilter{}
f.SetAll(true)
if err := pc.SetICMPFilter(&f); err != nil {
// Best-effort: sending still works without the filter, so only log.
log.G(ctx).WithError(err).Errorf("failed to set ICMP filter")
}
cm := &ipv6.ControlMessage{
// https://datatracker.ietf.org/doc/html/rfc4861#section-3.1
// By setting the Hop Limit to 255, Neighbor Discovery is immune to
// off-link senders that accidentally or intentionally send ND
// messages.
HopLimit: 255,
Src: ip,
IfIndex: ifIndex,
}
// Fill in the target IP and target link-layer address option.
pkt := slices.Clone(naTemplate)
copy(pkt[8:24], ip)
copy(pkt[26:32], mac)
return &UnsolNA{
pkt: pkt,
pc: pc,
cm: cm,
}, nil
}
// Send sends an unsolicited Neighbour Advertisement message to the
// all-nodes link-local multicast address.
func (un *UnsolNA) Send() error {
n, err := un.pc.WriteTo(un.pkt, un.cm, &net.IPAddr{IP: net.IPv6linklocalallnodes})
if err != nil {
return err
}
// A short write means the advertisement was not sent in full.
if n != len(un.pkt) {
return fmt.Errorf("failed to send packet: len:%d sent:%d", len(un.pkt), n)
}
return nil
}
// Close releases resources.
func (un *UnsolNA) Close() error {
	pc := un.pc
	if pc == nil {
		// Already closed; closing twice is a no-op.
		return nil
	}
	un.pc = nil
	return pc.Close()
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package maputil
// FilterValues returns the values of in for which fn reports true. The
// result is nil when nothing matches; ordering follows map iteration and is
// therefore unspecified.
func FilterValues[K comparable, V any](in map[K]V, fn func(V) bool) []V {
	var kept []V
	for _, val := range in {
		if !fn(val) {
			continue
		}
		kept = append(kept, val)
	}
	return kept
}
// Package modprobe attempts to load kernel modules. It may have more success
// than simply running "modprobe", particularly for docker-in-docker.
package modprobe
import (
"context"
"errors"
"fmt"
"os/exec"
"strings"
"github.com/containerd/log"
"golang.org/x/sys/unix"
)
// LoadModules attempts to load kernel modules, if necessary.
//
// isLoaded must be a function that checks whether the modules are loaded. It may
// be called multiple times. isLoaded must return an error to indicate that the
// modules still need to be loaded, otherwise nil.
//
// For each method of loading modules, LoadModules will attempt the load for each
// of modNames, then it will call isLoaded to check the result - moving on to try
// the next method if needed, and there is one.
//
// The returned error is the result of the final call to isLoaded.
func LoadModules(ctx context.Context, isLoaded func() error, modNames ...string) error {
	if err := isLoaded(); err == nil {
		log.G(ctx).WithFields(log.Fields{
			"modules": modNames,
		}).Debug("Modules already loaded")
		return nil
	}
	// Try the ioctl trick first; fall back to modprobe only if that fails.
	if err := tryLoad(ctx, isLoaded, modNames, ioctlLoader{}); err == nil {
		return nil
	}
	return tryLoad(ctx, isLoaded, modNames, modprobeLoader{})
}
// loader is a strategy for loading a kernel module.
type loader interface {
// name identifies the strategy in log messages.
name() string
// load attempts to load the named module.
load(modName string) error
}
// tryLoad attempts to load each of modNames with the given loader, then uses
// isLoaded to decide whether the modules are now present. Individual load
// errors are collected for logging only; the isLoaded result is what's returned.
func tryLoad(ctx context.Context, isLoaded func() error, modNames []string, loader loader) error {
	var errs []error
	for _, name := range modNames {
		if err := loader.load(name); err != nil {
			errs = append(errs, err)
		}
	}
	fields := log.Fields{
		"loader":     loader.name(),
		"modules":    modNames,
		"loadErrors": errors.Join(errs...),
	}
	if checkResult := isLoaded(); checkResult != nil {
		fields["checkResult"] = checkResult
		log.G(ctx).WithFields(fields).Debug("Modules not loaded")
		return checkResult
	}
	log.G(ctx).WithFields(fields).Debug("Modules loaded")
	return nil
}
// ioctlLoader attempts to load the module using an ioctl() to get the interface index
// of a module - it won't have one, but the kernel may load the module. This tends to
// work in docker-in-docker, where the inner-docker may not have "modprobe" or access
// to modules in the host's filesystem.
type ioctlLoader struct{}

// name identifies this loader in log messages.
func (il ioctlLoader) name() string { return "ioctl" }

// load asks the kernel for the interface index of a (non-existent) interface
// named after the module, which can trigger a kernel-side module load.
func (il ioctlLoader) load(modName string) error {
sd, err := unix.Socket(unix.AF_INET, unix.SOCK_DGRAM, 0)
if err != nil {
return fmt.Errorf("creating socket for ioctl load of %s: %w", modName, err)
}
defer unix.Close(sd)
// This tends to work, if running with CAP_SYS_MODULE, because...
// https://github.com/torvalds/linux/blob/6f7da290413ba713f0cdd9ff1a2a9bb129ef4f6c/net/core/dev_ioctl.c#L457
// https://github.com/torvalds/linux/blob/6f7da290413ba713f0cdd9ff1a2a9bb129ef4f6c/net/core/dev_ioctl.c#L371-L372
ifreq, err := unix.NewIfreq(modName)
if err != nil {
return fmt.Errorf("creating ifreq for %s: %w", modName, err)
}
// An error is returned even if the module load is successful. So, ignore it.
_ = unix.IoctlIfreq(sd, unix.SIOCGIFINDEX, ifreq)
return nil
}
// modprobeLoader attempts to load a kernel module using modprobe.
type modprobeLoader struct{}

// name identifies this loader in log messages.
func (ml modprobeLoader) name() string { return "modprobe" }

// load shells out to "modprobe -va", wrapping any failure with the command's
// combined output for diagnosis.
func (ml modprobeLoader) load(modName string) error {
	out, err := exec.Command("modprobe", "-va", modName).CombinedOutput()
	if err == nil {
		return nil
	}
	return fmt.Errorf("modprobe %s failed with message: %q, error: %w", modName, strings.TrimSpace(string(out)), err)
}
package netiputil
import (
"net"
"net/netip"
"github.com/docker/docker/daemon/libnetwork/ipbits"
)
// ToIPNet converts p into a *net.IPNet, returning nil if p is not valid.
func ToIPNet(p netip.Prefix) *net.IPNet {
if !p.IsValid() {
return nil
}
return &net.IPNet{
IP: p.Addr().AsSlice(),
Mask: net.CIDRMask(p.Bits(), p.Addr().BitLen()),
}
}
// ToPrefix converts n into a netip.Prefix. If n is not a valid IPv4 or IPV6
// address, ToPrefix returns netip.Prefix{}, false.
func ToPrefix(n *net.IPNet) (netip.Prefix, bool) {
if ll := len(n.Mask); ll != net.IPv4len && ll != net.IPv6len {
return netip.Prefix{}, false
}
addr, ok := netip.AddrFromSlice(n.IP)
if !ok {
return netip.Prefix{}, false
}
ones, bits := n.Mask.Size()
if ones == 0 && bits == 0 {
return netip.Prefix{}, false
}
return netip.PrefixFrom(addr.Unmap(), ones), true
}
// HostID masks out the 'bits' most-significant bits of addr. The result is
// undefined if bits > addr.BitLen().
func HostID(addr netip.Addr, bits uint) uint64 {
// Extract the host field: every bit of addr below the top 'bits' prefix bits.
return ipbits.Field(addr, bits, uint(addr.BitLen()))
}
// SubnetRange returns the amount to add to network.Addr() in order to yield the
// first and last addresses in subnet, respectively.
func SubnetRange(network, subnet netip.Prefix) (start, end uint64) {
// start is subnet's offset within network; end adds subnet's size
// (2^(address bits - prefix length)) minus one.
start = HostID(subnet.Addr(), uint(network.Bits()))
end = start + (1 << uint64(subnet.Addr().BitLen()-subnet.Bits())) - 1
return start, end
}
// AddrPortFromNet converts a net.Addr into a netip.AddrPort.
func AddrPortFromNet(addr net.Addr) netip.AddrPort {
if a, ok := addr.(interface{ AddrPort() netip.AddrPort }); ok {
return a.AddrPort()
}
return netip.AddrPort{}
}
// LastAddr returns the last address of prefix 'p'.
func LastAddr(p netip.Prefix) netip.Addr {
// Step back to the address just before the prefix, then advance by one
// whole prefix-sized block (1 shifted by the host-bit count) to land on
// the prefix's final address.
return ipbits.Add(p.Addr().Prev(), 1, uint(p.Addr().BitLen()-p.Bits()))
}
// PrefixCompare two prefixes and return a negative, 0, or a positive integer as
// required by [slices.SortFunc]. When two prefixes with the same address is
// provided, the shortest one will be sorted first.
func PrefixCompare(a, b netip.Prefix) int {
cmp := a.Addr().Compare(b.Addr())
if cmp != 0 {
return cmp
}
return a.Bits() - b.Bits()
}
// PrefixAfter returns the prefix of size 'sz' right after 'prev'.
func PrefixAfter(prev netip.Prefix, sz int) netip.Prefix {
// Advance by whichever block is larger (the smaller bit count), so the
// result cannot overlap prev.
s := sz
if prev.Bits() < sz {
s = prev.Bits()
}
addr := ipbits.Add(prev.Addr(), 1, uint(prev.Addr().BitLen()-s))
// Wrapping around to the unspecified (all-zero) address means the address
// space is exhausted.
if addr.IsUnspecified() {
return netip.Prefix{}
}
return netip.PrefixFrom(addr, sz).Masked()
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
// Package nftables provides methods to create an nftables table and manage its maps, sets,
// chains, and rules.
//
// To use it, the first step is to create a [TableRef] using [NewTable]. The table can
// then be populated and managed using that ref.
//
// Modifications to the table are only applied (sent to "nft") when [TableRef.Apply] is
// called. This means a number of updates can be made, for example, adding all the
// rules needed for a docker network - and those rules will then be applied atomically
// in a single "nft" run.
//
// [TableRef.Apply] can only be called after [Enable], and only if [Enable] returns
// true (meaning an "nft" executable was found). [Enabled] can be called to check
// whether nftables has been enabled.
//
// Be aware:
// - The implementation is far from complete, only functionality needed so-far has
// been included. Currently, there's only a limited set of chain/map/set types,
// there's no way to delete sets/maps etc.
// - There's no rollback so, once changes have been made to a TableRef, if the
// Apply fails there is no way to undo changes. The TableRef will be out-of-sync
// with the actual state of nftables.
// - This is a thin layer between code and "nft", it doesn't do much error checking. So,
// for example, if you get the syntax of a rule wrong the issue won't be reported
// until Apply is called.
// - Also in the category of no-error-checking, there's no reference checking. If you
// delete a chain that's still referred to by a map, set or another chain, "nft" will
// report an error when Apply is called.
// - Error checking here is meant to help spot logical errors in the code, like adding
// a rule twice, which would be fine by "nft" as it'd just create a duplicate rule.
// - The existing state of a table in the ruleset is irrelevant, once a Table is created
// by this package it will be flushed. Putting it another way, this package is
// write-only, it does not load any state from the host.
// - Errors from "nft" are logged along with the line-numbered command that failed,
// that's the place to look when things go wrong.
package nftables
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"os/exec"
"slices"
"strconv"
"strings"
"sync"
"text/template"
"github.com/containerd/log"
"go.opentelemetry.io/otel"
)
// Prefix for OTEL span names.
const spanPrefix = "libnetwork.internal.nftables"

// Package-level state managed by [Enable]/[Disable].
var (
// nftPath is the path of the "nft" tool, set by [Enable] and left empty if the tool
// is not present - in which case, nftables is disabled.
nftPath string
// nftEnableError is the error returned by [Enable] if nftables could not
// be initialised.
nftEnableError error
// incrementalUpdateTempl is a parsed text/template, used to apply incremental updates.
incrementalUpdateTempl *template.Template
// reloadTempl is a parsed text/template, used to apply a whole table.
reloadTempl *template.Template
// enableOnce is used by [Enable] to avoid checking the path for "nft" more than once.
enableOnce sync.Once
)

// Sentinel errors for rule management.
var (
// ErrRuleExist is returned when a rule is added, but it already exists in the same
// rule group of a chain.
ErrRuleExist = errors.New("rule exists")
// ErrRuleNotExist is returned when a rule is removed, but does not exist in the
// rule group of a chain.
ErrRuleNotExist = errors.New("rule does not exist")
)
// BaseChainType enumerates the base chain types.
// See https://wiki.nftables.org/wiki-nftables/index.php/Configuring_chains#Base_chain_types
type BaseChainType string

// Base chain types accepted by "nft" in a chain's "type" clause.
const (
BaseChainTypeFilter BaseChainType = "filter"
BaseChainTypeRoute BaseChainType = "route"
BaseChainTypeNAT BaseChainType = "nat"
)

// BaseChainHook enumerates the base chain hook types.
// See https://wiki.nftables.org/wiki-nftables/index.php/Configuring_chains#Base_chain_hooks
type BaseChainHook string

// Netfilter hooks a base chain can attach to.
const (
BaseChainHookIngress BaseChainHook = "ingress"
BaseChainHookPrerouting BaseChainHook = "prerouting"
BaseChainHookInput BaseChainHook = "input"
BaseChainHookForward BaseChainHook = "forward"
BaseChainHookOutput BaseChainHook = "output"
BaseChainHookPostrouting BaseChainHook = "postrouting"
)

// Standard priority values for base chains.
// (Not for the bridge family, those are different.)
const (
BaseChainPriorityRaw = -300
BaseChainPriorityMangle = -150
BaseChainPriorityDstNAT = -100
BaseChainPriorityFilter = 0
BaseChainPrioritySecurity = 50
BaseChainPrioritySrcNAT = 100
)

// Family enumerates address families.
type Family string

// Address families this package creates tables for.
const (
IPv4 Family = "ip"
IPv6 Family = "ip6"
)

// nftType enumerates nft types that can be used to define maps/sets etc.
type nftType string

// Element types accepted in map/set "type" clauses.
const (
nftTypeIPv4Addr nftType = "ipv4_addr"
nftTypeIPv6Addr nftType = "ipv6_addr"
nftTypeEtherAddr nftType = "ether_addr"
nftTypeInetProto nftType = "inet_proto"
nftTypeInetService nftType = "inet_service"
nftTypeMark nftType = "mark"
nftTypeIfname nftType = "ifname"
)
// Enable tries once to initialise nftables: locate the "nft" binary and parse
// the command templates. Subsequent calls return the first attempt's result.
func Enable() error {
	enableOnce.Do(func() {
		path, lookErr := exec.LookPath("nft")
		if lookErr != nil {
			log.G(context.Background()).WithError(lookErr).Warnf("Failed to find nft tool")
			nftEnableError = fmt.Errorf("failed to find nft tool: %w", lookErr)
			return
		}
		if tmplErr := parseTemplate(); tmplErr != nil {
			log.G(context.Background()).WithError(tmplErr).Error("Internal error while initialising nftables")
			nftEnableError = fmt.Errorf("internal error while initialising nftables: %w", tmplErr)
			return
		}
		// Only record the path once everything else succeeded; a non-empty
		// nftPath is what makes Enabled() report true.
		nftPath = path
	})
	return nftEnableError
}
// Enabled returns true if the "nft" tool is available and [Enable] has been called.
func Enabled() bool {
	return len(nftPath) > 0
}

// Disable undoes Enable. Intended for unit testing.
func Disable() {
	// Reset the Once first so a later Enable re-runs initialisation.
	enableOnce = sync.Once{}
	nftPath = ""
	reloadTempl = nil
	incrementalUpdateTempl = nil
}
//////////////////////////////
// Tables

// table is the internal representation of an nftables table.
// Its elements need to be exported for use by text/template, but they should only be
// manipulated via exported methods.
type table struct {
Name string
Family Family
VMaps map[string]*vMap
Sets map[string]*set
Chains map[string]*chain
Dirty bool // Set when the table is new, not when its elements change.
// DeleteChainCommands holds "delete chain" commands to emit on the next Apply.
DeleteChainCommands []string
}

// TableRef is a handle for an nftables table.
type TableRef struct {
// t is the shared mutable table state; TableRef is copyable because it
// only holds this pointer.
t *table
}
// NewTable creates a new nftables table and returns a [TableRef]
//
// See https://wiki.nftables.org/wiki-nftables/index.php/Configuring_tables
//
// The table will be created and flushed when [TableRef.Apply] is next called.
// It's flushed in case it already exists in the host's nftables - when that
// happens, rules in its chains will be deleted but not the chains themselves,
// maps, sets, or elements of maps or sets. But, those un-flushed items can't do
// anything disruptive unless referred to by rules, and they will be flushed if
// they get re-created via the [TableRef], when [TableRef.Apply] is next called
// (so, before they can be used by a new rule).
func NewTable(family Family, name string) (TableRef, error) {
	tbl := &table{
		Name:   name,
		Family: family,
		VMaps:  make(map[string]*vMap),
		Sets:   make(map[string]*set),
		Chains: make(map[string]*chain),
		// Mark the new table dirty so the first Apply creates and flushes it.
		Dirty: true,
	}
	return TableRef{t: tbl}, nil
}
// Family returns the address family of the nftables table described by [TableRef].
func (t TableRef) Family() Family {
return t.t.Family
}
// incrementalUpdateTemplText is used with text/template to generate an nftables command file
// (which will be applied atomically). Updates using this template are always incremental.
// Steps are:
//   - declare the table and its sets/maps with empty versions of modified chains, so that
//     they can be flushed/deleted if they don't yet exist. (They need to be flushed in case
//     a version of them was left behind by an old incarnation of the daemon. But, it's an
//     error to flush or delete something that doesn't exist. So, avoid having to parse nft's
//     stderr to work out what happened by making sure they do exist before flushing.)
//   - if the table is newly declared, flush rules from its chains
//   - flush each newly declared map/set
//   - delete deleted map/set elements
//   - flush modified chains
//   - delete deleted chains
//   - re-populate modified chains
//   - add new map/set elements
//
// The template is executed with a *table as its data.
const incrementalUpdateTemplText = `{{$family := .Family}}{{$tableName := .Name}}
table {{$family}} {{$tableName}} {
{{range .VMaps}}map {{.Name}} {
type {{.ElementType}} : verdict
{{if len .Flags}}flags{{range .Flags}} {{.}}{{end}}{{end}}
}
{{end}}
{{range .Sets}}set {{.Name}} {
type {{.ElementType}}
{{if len .Flags}}flags{{range .Flags}} {{.}}{{end}}{{end}}
}
{{end}}
{{range .Chains}}{{if .Dirty}}chain {{.Name}} {
{{if .ChainType}}type {{.ChainType}} hook {{.Hook}} priority {{.Priority}}; policy {{.Policy}}{{end}}
} ; {{end}}{{end}}
}
{{if .Dirty}}flush table {{$family}} {{$tableName}}{{end}}
{{range .VMaps}}{{if .Dirty}}flush map {{$family}} {{$tableName}} {{.Name}}
{{end}}{{end}}
{{range .Sets}}{{if .Dirty}}flush set {{$family}} {{$tableName}} {{.Name}}
{{end}}{{end}}
{{range .Chains}}{{if .Dirty}}flush chain {{$family}} {{$tableName}} {{.Name}}
{{end}}{{end}}
{{range .VMaps}}{{if .DeletedElements}}delete element {{$family}} {{$tableName}} {{.Name}} { {{range $k,$v := .DeletedElements}}{{$k}}, {{end}} }
{{end}}{{end}}
{{range .Sets}}{{if .DeletedElements}}delete element {{$family}} {{$tableName}} {{.Name}} { {{range $k,$v := .DeletedElements}}{{$k}}, {{end}} }
{{end}}{{end}}
{{range .DeleteChainCommands}}{{.}}
{{end}}
table {{$family}} {{$tableName}} {
{{range .Chains}}{{if .Dirty}}chain {{.Name}} {
{{if .ChainType}}type {{.ChainType}} hook {{.Hook}} priority {{.Priority}}; policy {{.Policy}}{{end}}
{{range .Rules}}{{.}}
{{end}}
}
{{end}}{{end}}
}
{{range .VMaps}}{{if .AddedElements}}add element {{$family}} {{$tableName}} {{.Name}} { {{range $k,$v := .AddedElements}}{{$k}} : {{$v}}, {{end}} }
{{end}}{{end}}
{{range .Sets}}{{if .AddedElements}}add element {{$family}} {{$tableName}} {{.Name}} { {{range $k,$v := .AddedElements}}{{$k}}, {{end}} }
{{end}}{{end}}
`
// reloadTemplText is used with text/template to generate an nftables command file
// (which will be applied atomically), to fully re-create a table.
//
// It first declares the table so if it doesn't already exist, it can be deleted.
// Then it deletes the table and re-creates it.
//
// The template is executed with a *table as its data.
const reloadTemplText = `{{$family := .Family}}{{$tableName := .Name}}
table {{$family}} {{$tableName}} {}
delete table {{$family}} {{$tableName}}
table {{$family}} {{$tableName}} {
{{range .VMaps}}map {{.Name}} {
type {{.ElementType}} : verdict
{{if len .Flags}}flags{{range .Flags}} {{.}}{{end}}{{end}}
{{if .Elements}}elements = {
{{range $k,$v := .Elements}}{{$k}} : {{$v}},
{{end -}}
}{{end}}
}
{{end}}
{{range .Sets}}set {{.Name}} {
type {{.ElementType}}
{{if len .Flags}}flags{{range .Flags}} {{.}}{{end}}{{end}}
{{if .Elements}}elements = {
{{range $k,$v := .Elements}}{{$k}},
{{end -}}
}{{end}}
}
{{end}}
{{range .Chains}}chain {{.Name}} {
{{if .ChainType}}type {{.ChainType}} hook {{.Hook}} priority {{.Priority}}; policy {{.Policy}}{{end}}
{{range .Rules}}{{.}}
{{end}}
}
{{end}}
}
`
// Apply makes incremental updates to nftables, corresponding to changes made to
// the [TableRef] since Apply was last called.
func (t TableRef) Apply(ctx context.Context) error {
	if !Enabled() {
		return errors.New("nftables is not enabled")
	}
	// Render the pending changes into an "nft -f" command file.
	var cmds bytes.Buffer
	if err := incrementalUpdateTempl.Execute(&cmds, t.t); err != nil {
		return fmt.Errorf("failed to execute template nft ruleset: %w", err)
	}
	if err := nftApply(ctx, cmds.Bytes()); err != nil {
		// On error, log a line-numbered version of the generated "nft" input
		// (because nft error messages refer to line numbers).
		var numbered strings.Builder
		for n, l := range bytes.SplitAfter(cmds.Bytes(), []byte("\n")) {
			numbered.WriteString(strconv.Itoa(n + 1))
			numbered.WriteString(":\t")
			numbered.Write(l)
		}
		log.G(ctx).Error("nftables: failed to update nftables:\n", numbered.String(), "\n", err)
		// It's possible something destructive has happened to nftables. For example, in
		// integration-cli tests, tests start daemons in the same netns as the integration
		// test's own daemon. They don't always use their own daemon, but they tend to leave
		// behind networks for the test infrastructure to clean up between tests. Starting
		// a daemon flushes the "docker-bridges" table, so the cleanup fails to delete a
		// rule that's been flushed. So, try reloading the whole table to get back in-sync.
		return t.Reload(ctx)
	}
	// The incremental changes were applied; reset the table's pending state.
	t.t.updatesApplied()
	return nil
}
// Reload deletes the table, then re-creates it from in-memory state, atomically.
func (t TableRef) Reload(ctx context.Context) error {
	if !Enabled() {
		return errors.New("nftables is not enabled")
	}
	ctx = log.WithLogger(ctx, log.G(ctx).WithFields(log.Fields{"table": t.t.Name, "family": t.t.Family}))
	log.G(ctx).Warn("nftables: reloading table")
	// Build the update.
	var buf bytes.Buffer
	if err := reloadTempl.Execute(&buf, t.t); err != nil {
		return fmt.Errorf("failed to execute reload template: %w", err)
	}
	if err := nftApply(ctx, buf.Bytes()); err != nil {
		// On error, log a line-numbered version of the generated "nft" input (because
		// nft error messages refer to line numbers).
		var sb strings.Builder
		for i, line := range bytes.SplitAfter(buf.Bytes(), []byte("\n")) {
			sb.WriteString(strconv.Itoa(i + 1))
			sb.WriteString(":\t")
			sb.Write(line)
		}
		// Fixed typo in the log message: "nftable" -> "nftables".
		log.G(ctx).Error("nftables: failed to reload nftables:\n", sb.String(), "\n", err)
		return err
	}
	// Note that updates have been applied.
	t.t.updatesApplied()
	return nil
}
// ////////////////////////////
// Chains
// RuleGroup is used to allocate rules within a chain to a group. These groups are
// purely an internal construct, nftables knows nothing about them. Within groups
// rules retain the order in which they were added, and groups are ordered from
// lowest to highest numbered group.
type RuleGroup int
// chain is the internal representation of an nftables chain.
// Its elements need to be exported for use by text/template, but they should only be
// manipulated via exported methods.
type chain struct {
	table *table // The table this chain belongs to.
	Name string
	ChainType BaseChainType // Empty for a regular (non-base) chain.
	Hook BaseChainHook // Only meaningful for base chains.
	Priority int // Only meaningful for base chains.
	Policy string // Default verdict for base chains (e.g. "accept").
	Dirty bool // True when the chain needs to be (re-)written by the next Apply.
	ruleGroups map[RuleGroup][]string // Rules, bucketed by group; see Rules().
}
// ChainRef is a handle for an nftables chain.
type ChainRef struct {
	c *chain
}
// BaseChain constructs a new nftables base chain and returns a [ChainRef].
//
// See https://wiki.nftables.org/wiki-nftables/index.php/Configuring_chains#Adding_base_chains
//
// It is an error to create a base chain that already exists.
// If the underlying chain already exists, it will be flushed by the
// next [TableRef.Apply] before new rules are added.
func (t TableRef) BaseChain(ctx context.Context, name string, chainType BaseChainType, hook BaseChainHook, priority int) (ChainRef, error) {
	if _, exists := t.t.Chains[name]; exists {
		return ChainRef{}, fmt.Errorf("chain %q already exists", name)
	}
	// New base chains default to an "accept" policy, and are marked Dirty so
	// the next Apply writes (and, if pre-existing, flushes) them.
	newChain := &chain{
		table:      t.t,
		Name:       name,
		ChainType:  chainType,
		Hook:       hook,
		Priority:   priority,
		Policy:     "accept",
		Dirty:      true,
		ruleGroups: map[RuleGroup][]string{},
	}
	t.t.Chains[name] = newChain
	log.G(ctx).WithFields(log.Fields{
		"family": t.t.Family,
		"table":  t.t.Name,
		"chain":  name,
		"type":   chainType,
		"hook":   hook,
		"prio":   priority,
	}).Debug("nftables: created base chain")
	return ChainRef{c: newChain}, nil
}
// Chain returns a [ChainRef] for an existing chain (which may be a base chain).
// If there is no existing chain, a regular chain is added and its [ChainRef] is
// returned.
//
// See https://wiki.nftables.org/wiki-nftables/index.php/Configuring_chains#Adding_regular_chains
//
// If a new [ChainRef] is created and the underlying chain already exists, it
// will be flushed by the next [TableRef.Apply] before new rules are added.
func (t TableRef) Chain(ctx context.Context, name string) ChainRef {
	c, ok := t.t.Chains[name]
	if !ok {
		c = &chain{
			table:      t.t,
			Name:       name,
			Dirty:      true,
			ruleGroups: map[RuleGroup][]string{},
		}
		t.t.Chains[name] = c
		// Only log when a chain is actually created; previously this fired
		// (misleadingly) when returning an existing chain too.
		log.G(ctx).WithFields(log.Fields{
			"family": t.t.Family,
			"table":  t.t.Name,
			"chain":  name,
		}).Debug("nftables: created chain")
	}
	return ChainRef{c: c}
}
// ChainUpdateFunc is a function that can add rules to a chain, or remove rules
// from it. Its arguments are a [RuleGroup], a rule (possibly a format string),
// and optional fmt-style arguments for the rule.
type ChainUpdateFunc func(context.Context, RuleGroup, string, ...interface{}) error
// ChainUpdateFunc returns a [ChainUpdateFunc] to add rules to the named chain if
// enable is true, or to remove rules from the chain if enable is false.
// (Written as a convenience function to ease migration of iptables functions
// originally written with an enable flag.)
func (t TableRef) ChainUpdateFunc(ctx context.Context, name string, enable bool) ChainUpdateFunc {
	chainRef := t.Chain(ctx, name)
	if !enable {
		return chainRef.DeleteRule
	}
	return chainRef.AppendRule
}
// DeleteChain deletes a chain. It is an error to delete a chain that does not exist.
func (t TableRef) DeleteChain(ctx context.Context, name string) error {
	if _, found := t.t.Chains[name]; !found {
		return fmt.Errorf("chain %q does not exist", name)
	}
	delete(t.t.Chains, name)
	// Queue an explicit "delete chain" for the next Apply; removing the chain
	// from the in-memory table alone would not delete it from the kernel.
	cmd := fmt.Sprintf("delete chain %s %s %s", t.t.Family, t.t.Name, name)
	t.t.DeleteChainCommands = append(t.t.DeleteChainCommands, cmd)
	log.G(ctx).WithFields(log.Fields{
		"family": t.t.Family,
		"table":  t.t.Name,
		"chain":  name,
	}).Debug("nftables: deleted chain")
	return nil
}
// SetPolicy sets the default policy for a base chain. It is an error to call this
// for a non-base [ChainRef].
func (c ChainRef) SetPolicy(policy string) error {
	// Only base chains have a ChainType, and only base chains have a policy.
	if c.c.ChainType == "" {
		return errors.New("not a base chain")
	}
	c.c.Dirty = true
	c.c.Policy = policy
	return nil
}
// AppendRule appends a rule to a [RuleGroup] in a [ChainRef].
// rule may be a format string, with args its fmt-style arguments.
// Returns ErrRuleExist if the rule is already present in the group.
func (c ChainRef) AppendRule(ctx context.Context, group RuleGroup, rule string, args ...interface{}) error {
	if len(args) != 0 {
		rule = fmt.Sprintf(rule, args...)
	}
	existing := c.c.ruleGroups[group]
	if slices.Contains(existing, rule) {
		return ErrRuleExist
	}
	c.c.ruleGroups[group] = append(existing, rule)
	c.c.Dirty = true
	log.G(ctx).WithFields(log.Fields{
		"family": c.c.table.Family,
		"table":  c.c.table.Name,
		"chain":  c.c.Name,
		"group":  group,
		"rule":   rule,
	}).Debug("nftables: appended rule")
	return nil
}
// AppendRuleCf calls AppendRule and, on success, returns a cleanup function
// that deletes the rule again; on failure it returns the error.
func (c ChainRef) AppendRuleCf(ctx context.Context, group RuleGroup, rule string, args ...interface{}) (func(context.Context) error, error) {
	err := c.AppendRule(ctx, group, rule, args...)
	if err != nil {
		return nil, err
	}
	cleanup := func(ctx context.Context) error {
		return c.DeleteRule(ctx, group, rule, args...)
	}
	return cleanup, nil
}
// DeleteRule deletes a rule from a [RuleGroup] in a [ChainRef]. It is an error
// to delete from a group that does not exist (the error names the group), or to
// delete a rule that does not exist (ErrRuleNotExist).
func (c ChainRef) DeleteRule(ctx context.Context, group RuleGroup, rule string, args ...interface{}) error {
	if len(args) > 0 {
		rule = fmt.Sprintf(rule, args...)
	}
	rg, ok := c.c.ruleGroups[group]
	if !ok {
		return fmt.Errorf("rule group %d does not exist", group)
	}
	origLen := len(rg)
	c.c.ruleGroups[group] = slices.DeleteFunc(rg, func(r string) bool { return r == rule })
	if len(c.c.ruleGroups[group]) == origLen {
		return ErrRuleNotExist
	}
	c.c.Dirty = true
	log.G(ctx).WithFields(log.Fields{
		"family": c.c.table.Family,
		"table":  c.c.table.Name,
		"chain":  c.c.Name,
		// "group" added for consistency with AppendRule's log fields.
		"group": group,
		"rule":  rule,
	}).Debug("nftables: deleted rule")
	return nil
}
// ////////////////////////////
// VMaps
// vMap is the internal representation of an nftables verdict map.
// Its elements need to be exported for use by text/template, but they should only be
// manipulated via exported methods.
type vMap struct {
	table *table // The table this vmap belongs to.
	Name string
	ElementType nftType // Key type, e.g. ifname.
	Flags []string
	Elements map[string]string // Full key -> verdict contents, mirrored in the kernel.
	Dirty bool // New vMap, needs to be flushed (not set when elements are added/deleted).
	AddedElements map[string]string // Elements added since the last Apply.
	DeletedElements map[string]struct{} // Elements deleted since the last Apply.
}
// VMapRef is a handle for an nftables verdict map.
type VMapRef struct {
	v *vMap
}
// InterfaceVMap creates a map from interface name to a verdict and returns a [VMapRef],
// or returns an existing [VMapRef] if it has already been created.
//
// See https://wiki.nftables.org/wiki-nftables/index.php/Verdict_Maps_(vmaps)
//
// If a [VMapRef] is created and the underlying map already exists, it will be flushed
// by the next [TableRef.Apply] before new elements are added.
func (t TableRef) InterfaceVMap(ctx context.Context, name string) VMapRef {
	if existing, found := t.t.VMaps[name]; found {
		return VMapRef{existing}
	}
	m := &vMap{
		table:           t.t,
		Name:            name,
		ElementType:     nftTypeIfname,
		Elements:        map[string]string{},
		AddedElements:   map[string]string{},
		DeletedElements: map[string]struct{}{},
		Dirty:           true,
	}
	t.t.VMaps[name] = m
	log.G(ctx).WithFields(log.Fields{
		"family": t.t.Family,
		"table":  t.t.Name,
		"vmap":   name,
	}).Debug("nftables: created interface vmap")
	return VMapRef{m}
}
// AddElement adds an element to a verdict map. The caller must ensure the key has
// the correct type. It is an error to add a key that already exists.
func (v VMapRef) AddElement(ctx context.Context, key string, verdict string) error {
	if _, dup := v.v.Elements[key]; dup {
		return fmt.Errorf("verdict map already contains element %q", key)
	}
	// Record the element both in the full contents and in the pending-adds,
	// so the next Apply emits an "add element" for it.
	v.v.Elements[key] = verdict
	v.v.AddedElements[key] = verdict
	log.G(ctx).WithFields(log.Fields{
		"family":  v.v.table.Family,
		"table":   v.v.table.Name,
		"vmap":    v.v.Name,
		"key":     key,
		"verdict": verdict,
	}).Debug("nftables: added vmap element")
	return nil
}
// AddElementCf calls AddElement and, on success, returns a cleanup function that
// deletes the element again; on failure it returns the error.
func (v VMapRef) AddElementCf(ctx context.Context, key string, verdict string) (func(context.Context) error, error) {
	err := v.AddElement(ctx, key, verdict)
	if err != nil {
		return nil, err
	}
	cleanup := func(ctx context.Context) error {
		return v.DeleteElement(ctx, key)
	}
	return cleanup, nil
}
// DeleteElement deletes an element from a verdict map. It is an error to delete
// an element that does not exist.
func (v VMapRef) DeleteElement(ctx context.Context, key string) error {
	if _, found := v.v.Elements[key]; !found {
		return fmt.Errorf("verdict map does not contain element %q", key)
	}
	// Remove from the full contents and record a pending delete, so the next
	// Apply emits a "delete element" for it.
	delete(v.v.Elements, key)
	v.v.DeletedElements[key] = struct{}{}
	log.G(ctx).WithFields(log.Fields{
		"family": v.v.table.Family,
		"table":  v.v.table.Name,
		"vmap":   v.v.Name,
		"key":    key,
	}).Debug("nftables: deleted vmap element")
	return nil
}
// ////////////////////////////
// Sets
// set is the internal representation of an nftables set.
// Its elements need to be exported for use by text/template, but they should only be
// manipulated via exported methods.
type set struct {
	table *table // The table this set belongs to.
	Name string
	ElementType nftType // Element type, e.g. ipv4_addr / ipv6_addr.
	Flags []string
	Elements map[string]struct{} // Full contents, mirrored in the kernel.
	Dirty bool // New set, needs to be flushed (not set when elements are added/deleted).
	AddedElements map[string]struct{} // Elements added since the last Apply.
	DeletedElements map[string]struct{} // Elements deleted since the last Apply.
}
// SetRef is a handle for an nftables named set.
type SetRef struct {
	s *set
}
// PrefixSet creates a new named nftables set for IPv4 or IPv6 addresses (depending
// on the address family of the [TableRef]), and returns its [SetRef]. Or, if the
// set has already been created, its [SetRef] is returned.
//
// ([TableRef] does not support "inet", only "ip" or "ip6". So the element type can
// always be determined. But, there's no "inet" element type, so this will need to
// change if we need an "inet" table.)
//
// See https://wiki.nftables.org/wiki-nftables/index.php/Sets#Named_sets
func (t TableRef) PrefixSet(ctx context.Context, name string) SetRef {
	if existing, found := t.t.Sets[name]; found {
		return SetRef{existing}
	}
	// Element type follows the table's address family.
	elType := nftTypeIPv4Addr
	if t.t.Family == IPv6 {
		elType = nftTypeIPv6Addr
	}
	newSet := &set{
		table:           t.t,
		Name:            name,
		Elements:        map[string]struct{}{},
		ElementType:     elType,
		Flags:           []string{"interval"},
		Dirty:           true,
		AddedElements:   map[string]struct{}{},
		DeletedElements: map[string]struct{}{},
	}
	t.t.Sets[name] = newSet
	log.G(ctx).WithFields(log.Fields{
		"family": t.t.Family,
		"table":  t.t.Name,
		"set":    name,
	}).Debug("nftables: created set")
	return SetRef{newSet}
}
// AddElement adds an element to a set. It is the caller's responsibility to make sure
// the element has the correct type. It is an error to add an element that is already
// in the set.
func (s SetRef) AddElement(ctx context.Context, element string) error {
	if _, dup := s.s.Elements[element]; dup {
		return fmt.Errorf("set already contains element %q", element)
	}
	// Record in the full contents and in the pending-adds, so the next Apply
	// emits an "add element" for it.
	s.s.Elements[element] = struct{}{}
	s.s.AddedElements[element] = struct{}{}
	log.G(ctx).WithFields(log.Fields{
		"family":  s.s.table.Family,
		"table":   s.s.table.Name,
		"set":     s.s.Name,
		"element": element,
	}).Debug("nftables: added set element")
	return nil
}
// DeleteElement deletes an element from the set. It is an error to delete an
// element that is not in the set.
func (s SetRef) DeleteElement(ctx context.Context, element string) error {
	if _, found := s.s.Elements[element]; !found {
		return fmt.Errorf("set does not contain element %q", element)
	}
	// Remove from the full contents and record a pending delete, so the next
	// Apply emits a "delete element" for it.
	delete(s.s.Elements, element)
	s.s.DeletedElements[element] = struct{}{}
	log.G(ctx).WithFields(log.Fields{
		"family":  s.s.table.Family,
		"table":   s.s.table.Name,
		"set":     s.s.Name,
		"element": element,
	}).Debug("nftables: deleted set element")
	return nil
}
// ////////////////////////////
// Internal
// updatesApplied resets the table's pending-change tracking after a successful
// Apply or Reload: queued chain deletions are dropped, Dirty flags are cleared,
// and per-vmap/per-set add/delete journals are emptied.
func (t *table) updatesApplied() {
	// Keep the slice's capacity; the commands have been issued.
	t.DeleteChainCommands = t.DeleteChainCommands[:0]
	for _, ch := range t.Chains {
		ch.Dirty = false
	}
	for _, vm := range t.VMaps {
		vm.Dirty = false
		vm.AddedElements = map[string]string{}
		vm.DeletedElements = map[string]struct{}{}
	}
	for _, st := range t.Sets {
		st.Dirty = false
		st.AddedElements = map[string]struct{}{}
		st.DeletedElements = map[string]struct{}{}
	}
	t.Dirty = false
}
/* Can't make text/template range over this — "range" over a func-sequence (iter.Seq) is only supported by text/template from Go 1.23, so keep the slice-returning Rules below ...
func (c *chain) Rules() iter.Seq[string] {
groups := make([]int, 0, len(c.ruleGroups))
for group := range c.ruleGroups {
groups = append(groups, group)
}
slices.Sort(groups)
return func(yield func(string) bool) {
for _, group := range groups {
for _, rule := range c.ruleGroups[group] {
if !yield(rule) {
return
}
}
}
}
}
*/
// Rules returns the chain's rules, in order: groups from lowest to highest
// number, and within each group, rules in the order they were added.
func (c *chain) Rules() []string {
	groupIDs := make([]RuleGroup, 0, len(c.ruleGroups))
	total := 0
	for id, rules := range c.ruleGroups {
		groupIDs = append(groupIDs, id)
		total += len(rules)
	}
	// Map iteration order is random; sort the group IDs for a stable result.
	slices.Sort(groupIDs)
	ordered := make([]string, 0, total)
	for _, id := range groupIDs {
		ordered = append(ordered, c.ruleGroups[id]...)
	}
	return ordered
}
// parseTemplate parses the package's nft-command templates, storing the results
// in incrementalUpdateTempl and reloadTempl. It returns an error if either
// template text fails to parse.
func parseTemplate() error {
	var err error
	if incrementalUpdateTempl, err = template.New("ruleset").Parse(incrementalUpdateTemplText); err != nil {
		return fmt.Errorf("parsing 'incrementalUpdateTemplText': %w", err)
	}
	if reloadTempl, err = template.New("ruleset").Parse(reloadTemplText); err != nil {
		return fmt.Errorf("parsing 'reloadTemplText': %w", err)
	}
	return nil
}
// nftApply runs the "nft" command, feeding it nftCmd on stdin, and logs the
// command's output on success. On failure, the returned error includes nft's
// stderr output.
func nftApply(ctx context.Context, nftCmd []byte) error {
	ctx, span := otel.Tracer("").Start(ctx, spanPrefix+".nftApply")
	defer span.End()

	if !Enabled() {
		return errors.New("nftables is not enabled")
	}

	// Supply stdin from a reader and collect output in buffers, rather than
	// managing pipes by hand. The previous pipe-based version wrote all of
	// stdin before reading stdout, then stderr; that can deadlock if nft
	// fills a pipe buffer while we're still blocked writing its input (and
	// reading stdout to EOF before starting on stderr has the same risk).
	// With Stdin/Stdout/Stderr assigned, os/exec pumps all three concurrently.
	var stdout, stderr bytes.Buffer
	cmd := exec.Command(nftPath, "-f", "-")
	cmd.Stdin = bytes.NewReader(nftCmd)
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		return fmt.Errorf("running nft: %s %w", stderr.String(), err)
	}
	log.G(ctx).WithFields(log.Fields{"stdout": stdout.String(), "stderr": stderr.String()}).Debug("nftables: updated")
	return nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
// Package resolvconf is used to generate a container's /etc/resolv.conf file.
//
// Constructor Load and Parse read a resolv.conf file from the filesystem or
// a reader respectively, and return a ResolvConf object.
//
// The ResolvConf object can then be updated with overrides for nameserver,
// search domains, and DNS options.
//
// ResolvConf can then be transformed to make it suitable for legacy networking,
// a network with an internal nameserver, or used as-is for host networking.
//
// This package includes methods to write the file for the container, along with
// a hash that can be used to detect modifications made by the user to avoid
// overwriting those updates.
package resolvconf
import (
"bufio"
"bytes"
"context"
"io"
"io/fs"
"net/netip"
"os"
"slices"
"strconv"
"strings"
"github.com/containerd/log"
"github.com/moby/sys/atomicwriter"
"github.com/opencontainers/go-digest"
"github.com/pkg/errors"
)
// Fallback nameservers, to use if none can be obtained from the host or command
// line options. (Google public DNS, IPv4 and IPv6.)
var (
	defaultIPv4NSs = []netip.Addr{
		netip.MustParseAddr("8.8.8.8"),
		netip.MustParseAddr("8.8.4.4"),
	}
	defaultIPv6NSs = []netip.Addr{
		netip.MustParseAddr("2001:4860:4860::8888"),
		netip.MustParseAddr("2001:4860:4860::8844"),
	}
)
// ResolvConf represents a resolv.conf file. It can be constructed by
// reading a resolv.conf file, using method Parse().
type ResolvConf struct {
	nameServers []netip.Addr // Addresses from "nameserver" directives, or overrides.
	search []string // DNS search domains (last "search"/"domain" directive wins).
	options []string // Accumulated "options" values.
	other []string // Unrecognised directives from the host's file, if any.
	md metadata // Provenance info, used to generate trailing comments.
}
// ExtDNSEntry represents a nameserver address that was removed from the
// container's resolv.conf when it was transformed by TransformForIntNS(). These
// are addresses read from the host's file, or applied via an override ('--dns').
type ExtDNSEntry struct {
	Addr netip.Addr
	HostLoopback bool // The address is loopback, in the host's namespace.
}
// String renders the entry; addresses to be reached via the host's network
// namespace are wrapped as "host(addr)".
func (ed ExtDNSEntry) String() string {
	addr := ed.Addr.String()
	if !ed.HostLoopback {
		return addr
	}
	return "host(" + addr + ")"
}
// metadata is used to track where components of the generated file have come
// from, in order to generate comments in the file for debug/info. Struct members
// are exported for use by 'text/template'.
type metadata struct {
	SourcePath string // Path of the host file this config was parsed from.
	Header string // Verbatim text placed at the top of the generated file.
	NSOverride bool // Nameservers were replaced via OverrideNameServers.
	SearchOverride bool // Search domains were replaced via OverrideSearch.
	OptionsOverride bool // Options were replaced via OverrideOptions.
	NDotsFrom string // Where the "ndots" option came from: "host", "override" or "internal".
	Transform string // Which transform was applied: "legacy" or "internal resolver".
	InvalidNSs []string // Unparseable nameserver values from the source file.
	ExtNameServers []ExtDNSEntry // Nameservers moved aside by TransformForIntNS.
	Warnings []string // Warnings to emit as comments in the generated file.
}
// Load opens a file at path and parses it as a resolv.conf file.
// On error, the returned ResolvConf will be zero-valued.
func Load(path string) (ResolvConf, error) {
	file, err := os.Open(path)
	if err != nil {
		return ResolvConf{}, err
	}
	defer file.Close()
	return Parse(file, path)
}
// Parse parses a resolv.conf file from reader.
// path is optional if reader is an *os.File.
// On error, the returned ResolvConf will be zero-valued.
func Parse(reader io.Reader, path string) (ResolvConf, error) {
	var rc ResolvConf
	rc.md.SourcePath = path
	if path == "" {
		// Fall back to the reader's own name (e.g. an *os.File's path).
		namer, ok := reader.(interface{ Name() string })
		if ok {
			rc.md.SourcePath = namer.Name()
		}
	}
	lines := bufio.NewScanner(reader)
	for lines.Scan() {
		rc.processLine(lines.Text())
	}
	if err := lines.Err(); err != nil {
		return ResolvConf{}, systemError{err}
	}
	// Remember that an "ndots" option came from the host's file.
	if _, found := rc.Option("ndots"); found {
		rc.md.NDotsFrom = "host"
	}
	return rc, nil
}
// SetHeader sets the content to be included verbatim at the top of the
// generated resolv.conf file. No formatting or checking is done on the
// string. It must be valid resolv.conf syntax. (Comments must have '#'
// or ';' in the first column of each line).
//
// For example:
//
// SetHeader("# My resolv.conf\n# This file was generated.")
func (rc *ResolvConf) SetHeader(c string) {
	rc.md.Header = c
}
// NameServers returns a copy of the addresses used in nameserver directives,
// so callers cannot mutate the ResolvConf's internal state.
func (rc *ResolvConf) NameServers() []netip.Addr {
	return slices.Clone(rc.nameServers)
}
// OverrideNameServers replaces the current set of nameservers.
//
// The slice is cloned so that later changes by the caller cannot mutate this
// ResolvConf (consistent with OverrideOptions, which also clones its input).
func (rc *ResolvConf) OverrideNameServers(nameServers []netip.Addr) {
	rc.nameServers = slices.Clone(nameServers)
	rc.md.NSOverride = true
}
// Search returns a copy of the current DNS search domains, so callers cannot
// mutate the ResolvConf's internal state.
func (rc *ResolvConf) Search() []string {
	return slices.Clone(rc.search)
}
// OverrideSearch replaces the current DNS search domains. A "." entry (meaning
// "no search domains") is dropped from the stored list.
func (rc *ResolvConf) OverrideSearch(search []string) {
	var kept []string
	for _, domain := range search {
		if domain == "." {
			continue
		}
		kept = append(kept, domain)
	}
	rc.search = kept
	rc.md.SearchOverride = true
}
// Options returns a copy of the current options, so callers cannot mutate the
// ResolvConf's internal state.
func (rc *ResolvConf) Options() []string {
	return slices.Clone(rc.options)
}
// Option finds the last option named search, and returns (value, true) if
// found, else ("", false). Options are treated as "name:value", where the
// ":value" may be omitted.
//
// For example, for "ndots:1 edns0":
//
// Option("ndots") -> ("1", true)
// Option("edns0") -> ("", true)
func (rc *ResolvConf) Option(search string) (string, bool) {
	// Scan from the end: the last occurrence of an option wins.
	for i := len(rc.options); i > 0; i-- {
		name, value, _ := strings.Cut(rc.options[i-1], ":")
		if name == search {
			return value, true
		}
	}
	return "", false
}
// OverrideOptions replaces the current DNS options, cloning the input slice.
// md.NDotsFrom is set to "override" if the new options include "ndots", else
// it is cleared.
func (rc *ResolvConf) OverrideOptions(options []string) {
	rc.options = slices.Clone(options)
	if _, hasNDots := rc.Option("ndots"); hasNDots {
		rc.md.NDotsFrom = "override"
	} else {
		rc.md.NDotsFrom = ""
	}
	rc.md.OptionsOverride = true
}
// AddOption adds a single DNS option. If the option is a non-empty "ndots:"
// setting, the ndots value is recorded as internally supplied.
func (rc *ResolvConf) AddOption(option string) {
	// Matches "ndots:<something>" with a non-empty value — same as the
	// original len(option) > 6 check.
	if val, isNDots := strings.CutPrefix(option, "ndots:"); isNDots && val != "" {
		rc.md.NDotsFrom = "internal"
	}
	rc.options = append(rc.options, option)
}
// TransformForLegacyNw makes sure the resolv.conf file will be suitable for
// use in a legacy network (one that has no internal resolver).
// - Remove loopback addresses inherited from the host's resolv.conf, because
// they'll only work in the host's namespace.
// - Remove IPv6 addresses if !ipv6.
// - Add default nameservers if there are no addresses left.
func (rc *ResolvConf) TransformForLegacyNw(ipv6 bool) {
	rc.md.Transform = "legacy"
	if rc.md.NSOverride {
		// Overridden nameservers are used as-is.
		return
	}
	var keep []netip.Addr
	for _, ns := range rc.nameServers {
		if ns.IsLoopback() {
			continue
		}
		if ns.Is6() && !ipv6 {
			continue
		}
		keep = append(keep, ns)
	}
	rc.nameServers = keep
	if len(rc.nameServers) > 0 {
		return
	}
	log.G(context.TODO()).Info("No non-localhost DNS nameservers are left in resolv.conf. Using default external servers")
	rc.nameServers = defaultNSAddrs(ipv6)
	rc.md.Warnings = append(rc.md.Warnings, "Used default nameservers.")
}
// TransformForIntNS makes sure the resolv.conf file will be suitable for
// use in a network sandbox that has an internal DNS resolver.
// - Add internalNS as a nameserver.
// - Remove other nameservers, stashing them as ExtNameServers for the
// internal resolver to use.
// - Mark ExtNameServers that must be accessed from the host namespace.
// - If no ExtNameServer addresses are found, use the defaults.
// - Ensure there's an 'options' value for each entry in reqdOptions. If the
// option includes a ':', and an option with a matching prefix exists, it
// is not modified.
func (rc *ResolvConf) TransformForIntNS(
	internalNS netip.Addr,
	reqdOptions []string,
) ([]ExtDNSEntry, error) {
	// Add each of the nameservers read from the host's resolv.conf or supplied as an
	// override to ExtNameServers, for the internal resolver to talk to. Addresses
	// read from host config should be accessed from the host's network namespace
	// (HostLoopback=true). Addresses supplied as overrides are accessed from the
	// container's namespace.
	rc.md.ExtNameServers = nil
	for _, addr := range rc.nameServers {
		rc.md.ExtNameServers = append(rc.md.ExtNameServers, ExtDNSEntry{
			Addr: addr,
			HostLoopback: !rc.md.NSOverride,
		})
	}
	// The transformed config only lists the internal nameserver.
	rc.nameServers = []netip.Addr{internalNS}
	// For each option required by the nameserver, add it if not already present. If
	// the option is already present, don't override it. Apart from ndots - if the
	// ndots value is invalid and an ndots option is required, replace the existing
	// value.
	for _, opt := range reqdOptions {
		optName, _, _ := strings.Cut(opt, ":")
		if optName == "ndots" {
			rc.options = removeInvalidNDots(rc.options)
			// No need to update rc.md.NDotsFrom, if there is no ndots option remaining,
			// it'll be set to "internal" when the required value is added.
		}
		// Only the option name (before any ':') needs to match for the
		// required option to be considered present.
		if _, exists := rc.Option(optName); !exists {
			rc.AddOption(opt)
		}
	}
	rc.md.Transform = "internal resolver"
	if len(rc.md.ExtNameServers) == 0 {
		rc.md.Warnings = append(rc.md.Warnings, "NO EXTERNAL NAMESERVERS DEFINED")
	}
	return slices.Clone(rc.md.ExtNameServers), nil
}
// Generate returns content suitable for writing to a resolv.conf file. If comments
// is true, the file will include header information if supplied, and a trailing
// comment that describes how the file was constructed and lists external resolvers.
func (rc *ResolvConf) Generate(comments bool) ([]byte, error) {
	var b bytes.Buffer
	b.Grow(512) // estimated size for a regular resolv.conf we produce.
	// Optional user-supplied header, verbatim.
	if comments && rc.md.Header != "" {
		b.WriteString(rc.md.Header + "\n")
		b.WriteByte('\n')
	}
	// One "nameserver" line per address.
	for _, ns := range rc.nameServers {
		b.WriteString("nameserver ")
		b.WriteString(ns.String())
		b.WriteByte('\n')
	}
	// Single "search" line with space-separated domains, if any.
	if len(rc.search) > 0 {
		b.WriteString("search ")
		for i, s := range rc.search {
			if i > 0 {
				b.WriteByte(' ')
			}
			b.WriteString(s)
		}
		b.WriteByte('\n')
	}
	// Single "options" line with space-separated options, if any.
	if len(rc.options) > 0 {
		b.WriteString("options ")
		for i, s := range rc.options {
			if i > 0 {
				b.WriteByte(' ')
			}
			b.WriteString(s)
		}
		b.WriteByte('\n')
	}
	// Unrecognised directives from the source file are passed through as-is.
	for _, o := range rc.other {
		b.WriteString(o)
		b.WriteByte('\n')
	}
	// Trailing comment block describing how this file was constructed.
	if comments {
		b.WriteByte('\n')
		b.WriteString("# Based on host file: '" + rc.md.SourcePath + "'")
		if rc.md.Transform != "" {
			b.WriteString(" (" + rc.md.Transform + ")")
		}
		b.WriteByte('\n')
		for _, w := range rc.md.Warnings {
			b.WriteString("# ")
			b.WriteString(w)
			b.WriteByte('\n')
		}
		// Nameservers moved aside for the internal resolver, if any.
		if len(rc.md.ExtNameServers) > 0 {
			b.WriteString("# ExtServers: [")
			for i, ext := range rc.md.ExtNameServers {
				if i > 0 {
					b.WriteByte(' ')
				}
				b.WriteString(ext.String())
			}
			b.WriteByte(']')
			b.WriteByte('\n')
		}
		// Unparseable nameserver entries from the source file, if any.
		if len(rc.md.InvalidNSs) > 0 {
			b.WriteString("# Invalid nameservers: [")
			for i, ext := range rc.md.InvalidNSs {
				if i > 0 {
					b.WriteByte(' ')
				}
				b.WriteString(ext)
			}
			b.WriteByte(']')
			b.WriteByte('\n')
		}
		// Which categories were overridden by the caller.
		b.WriteString("# Overrides: [")
		var overrides int
		if rc.md.NSOverride {
			b.WriteString("nameservers")
			overrides++
		}
		if rc.md.SearchOverride {
			if overrides > 0 {
				b.WriteByte(' ')
			}
			b.WriteString("search")
			overrides++
		}
		if rc.md.OptionsOverride {
			if overrides > 0 {
				b.WriteByte(' ')
			}
			b.WriteString("options")
		}
		b.WriteByte(']')
		b.WriteByte('\n')
		if rc.md.NDotsFrom != "" {
			b.WriteString("# Option ndots from: " + rc.md.NDotsFrom + "\n")
		}
	}
	return b.Bytes(), nil
}
// WriteFile generates content and writes it to path. If hashPath is non-zero, it
// also writes a file containing a hash of the content, to enable UserModified()
// to determine whether the file has been modified.
func (rc *ResolvConf) WriteFile(path, hashPath string, perm os.FileMode) error {
	content, err := rc.Generate(true)
	if err != nil {
		return err
	}
	// Write the resolv.conf file - it's bind-mounted into the container, so can't
	// move a temp file into place, just have to truncate and write it.
	if err := os.WriteFile(path, content, perm); err != nil {
		return systemError{err}
	}
	if hashPath == "" {
		return nil
	}
	// Write the hash file (atomically, so a reader never sees a partial hash).
	hashFile, err := atomicwriter.New(hashPath, perm)
	if err != nil {
		return systemError{err}
	}
	defer hashFile.Close()
	_, err = hashFile.Write([]byte(digest.FromBytes(content)))
	return err
}
// UserModified can be used to determine whether the resolv.conf file has been
// modified since it was generated. It returns false with no error if the file
// matches the hash, true with no error if the file no longer matches the hash,
// and false with an error if the result cannot be determined.
func UserModified(rcPath, rcHashPath string) (bool, error) {
	storedHash, err := os.ReadFile(rcHashPath)
	if err != nil {
		// If the hash file doesn't exist, can only assume it hasn't been written
		// yet (so, the user hasn't modified the file it hashes).
		if errors.Is(err, fs.ErrNotExist) {
			return false, nil
		}
		return false, errors.Wrapf(err, "failed to read hash file %s", rcHashPath)
	}
	expected, err := digest.Parse(string(storedHash))
	if err != nil {
		return false, errors.Wrapf(err, "failed to parse hash file %s", rcHashPath)
	}
	// Stream the current file through a verifier for the stored digest.
	verifier := expected.Verifier()
	file, err := os.Open(rcPath)
	if err != nil {
		return false, errors.Wrapf(err, "failed to open %s to check for modifications", rcPath)
	}
	defer file.Close()
	if _, err := io.Copy(verifier, file); err != nil {
		return false, errors.Wrapf(err, "failed to hash %s to check for modifications", rcPath)
	}
	return !verifier.Verified(), nil
}
// processLine parses a single resolv.conf line, updating rc accordingly.
// Blank lines and comments ('#' or ';' in column one) are ignored.
func (rc *ResolvConf) processLine(line string) {
	// Strip blank lines and comments.
	if line == "" || line[0] == '#' || line[0] == ';' {
		return
	}
	fields := strings.Fields(line)
	if len(fields) == 0 {
		return
	}
	directive, args := fields[0], fields[1:]
	switch directive {
	case "nameserver":
		if len(args) == 0 {
			return
		}
		addr, err := netip.ParseAddr(args[0])
		if err != nil {
			// Keep the bad value for a comment in the generated file.
			rc.md.InvalidNSs = append(rc.md.InvalidNSs, args[0])
			return
		}
		rc.nameServers = append(rc.nameServers, addr)
	case "search", "domain":
		// 'domain' is an obsolete name for 'search'.
		if len(args) == 0 {
			return
		}
		// Only the last 'search' directive is used.
		rc.search = args
	case "options":
		if len(args) == 0 {
			return
		}
		// Accumulate options.
		rc.options = append(rc.options, args...)
	default:
		// Copy anything that's not a recognised directive.
		rc.other = append(rc.other, line)
	}
}
// defaultNSAddrs returns a fresh slice of fallback nameserver addresses:
// the IPv4 defaults, plus the IPv6 defaults when ipv6 is true.
func defaultNSAddrs(ipv6 bool) []netip.Addr {
	addrs := slices.Clone(defaultIPv4NSs)
	if ipv6 {
		addrs = append(addrs, defaultIPv6NSs...)
	}
	return addrs
}
// removeInvalidNDots filters ill-formed "ndots" settings from options: an ndots
// option is dropped unless it has a ':' separator and a non-negative integer
// value. Non-ndots options are kept unchanged, in order. The backing array of
// the options slice is reused, and freed tail elements are zeroed for GC.
func removeInvalidNDots(options []string) []string {
	return slices.DeleteFunc(options, func(opt string) bool {
		name, value, hasSep := strings.Cut(opt, ":")
		if name != "ndots" {
			return false // keep everything that isn't an ndots option
		}
		if !hasSep || value == "" {
			return true // "ndots" or "ndots:" — invalid
		}
		n, err := strconv.Atoi(value)
		return err != nil || n < 0 // non-numeric or negative — invalid
	})
}
// systemError implements [github.com/docker/docker/errdefs.ErrSystem].
// It wraps an underlying error, marking it as a system-level failure.
type systemError struct{ error }
// System marks the error as a system error (errdefs marker method).
func (systemError) System() {}
// Unwrap returns the wrapped error, for errors.Is/As.
func (e systemError) Unwrap() error {
	return e.error
}
package resolvconf
import (
"context"
"net/netip"
"sync"
"github.com/containerd/log"
)
const (
	// defaultPath is the default path to the resolv.conf that contains information to resolve DNS. See Path().
	defaultPath = "/etc/resolv.conf"
	// alternatePath is a path different from defaultPath, that may be used to resolve DNS. See Path().
	// It is the file generated and managed by systemd-resolved.
	alternatePath = "/run/systemd/resolve/resolv.conf"
)
// For Path to detect systemd (only needed for legacy networking).
var (
	// detectSystemdResolvConfOnce guards the one-time detection in Path().
	detectSystemdResolvConfOnce sync.Once
	// pathAfterSystemdDetection is the result of the detection; it defaults to
	// defaultPath and is switched to alternatePath if systemd-resolved is detected.
	pathAfterSystemdDetection = defaultPath
)
// Path returns the path to the resolv.conf file that libnetwork should use.
//
// When /etc/resolv.conf contains 127.0.0.53 as the only nameserver, then
// it is assumed systemd-resolved manages DNS. Because inside the container 127.0.0.53
// is not a valid DNS server, Path() returns /run/systemd/resolve/resolv.conf
// which is the resolv.conf that systemd-resolved generates and manages.
// Otherwise Path() returns /etc/resolv.conf.
//
// Errors are silenced as they will inevitably resurface at future open/read calls.
//
// More information at https://www.freedesktop.org/software/systemd/man/systemd-resolved.service.html#/etc/resolv.conf
//
// TODO(robmry) - alternatePath is only needed for legacy networking ...
//
// Host networking can use the host's resolv.conf as-is, and with an internal
// resolver it's also possible to use nameservers on the host's loopback
// interface. Once legacy networking is removed, this can always return
// defaultPath.
func Path() string {
	detectSystemdResolvConfOnce.Do(func() {
		rc, err := Load(defaultPath)
		if err != nil {
			// silencing error as it will resurface at next calls trying to read defaultPath
			return
		}
		// A lone 127.0.0.53 nameserver is the signature of systemd-resolved.
		systemdResolved := netip.MustParseAddr("127.0.0.53")
		if ns := rc.nameServers; len(ns) == 1 && ns[0] == systemdResolved {
			pathAfterSystemdDetection = alternatePath
			log.G(context.TODO()).Infof("detected 127.0.0.53 nameserver, assuming systemd-resolved, so using resolv.conf: %s", alternatePath)
		}
	})
	return pathAfterSystemdDetection
}
// RootlessKit integration - if required by RootlessKit's port driver, let it know
// about port mappings as they're added and removed.
//
// This is based on / copied from rootlesskit-docker-proxy, which was previously
// installed as a proxy for docker-proxy:
// https://github.com/rootless-containers/rootlesskit/blob/4fb2e2cb80bf13eb28b7f2a4317b63406b89ad32/cmd/rootlesskit-docker-proxy/main.go
package rlkclient
import (
"context"
"errors"
"fmt"
"net/netip"
"os"
"path/filepath"
"strings"
"github.com/rootless-containers/rootlesskit/v2/pkg/api/client"
"github.com/rootless-containers/rootlesskit/v2/pkg/port"
)
// PortDriverClient is a client for RootlessKit's port driver API. It records
// the driver name, the protocols the driver supports, and (when the driver
// disallows loopback child IPs) the child namespace address to map to.
//
// A nil *PortDriverClient is valid; its methods behave as no-ops.
type PortDriverClient struct {
	client         client.Client
	portDriverName string              // e.g. "builtin" or "slirp4netns"
	protos         map[string]struct{} // supported protocols, e.g. "tcp4", "tcp6"
	childIP        netip.Addr          // set only when the driver disallows loopback child IPs
}
// NewPortDriverClient connects to the RootlessKit API socket found under
// $ROOTLESSKIT_STATE_DIR and returns a client for RootlessKit's port driver.
//
// It returns (nil, nil) when RootlessKit reports no cooperating port driver
// (driver "none" or "implicit"), in which case no port-mapping requests are
// needed. It returns an error when the environment variable is unset, the
// socket cannot be reached, or the info API is unavailable (RootlessKit
// older than v0.14.0).
func NewPortDriverClient(ctx context.Context) (*PortDriverClient, error) {
	stateDir := os.Getenv("ROOTLESSKIT_STATE_DIR")
	if stateDir == "" {
		return nil, errors.New("$ROOTLESSKIT_STATE_DIR needs to be set")
	}
	socketPath := filepath.Join(stateDir, "api.sock")
	c, err := client.New(socketPath)
	if err != nil {
		return nil, fmt.Errorf("error while connecting to RootlessKit API socket: %w", err)
	}
	info, err := c.Info(ctx)
	if err != nil {
		return nil, fmt.Errorf("failed to call info API, probably RootlessKit binary is too old (needs to be v0.14.0 or later): %w", err)
	}
	// info.PortDriver is currently nil for "none" and "implicit", but this may change in future
	if info.PortDriver == nil || info.PortDriver.Driver == "none" || info.PortDriver.Driver == "implicit" {
		return nil, nil
	}
	pdc := &PortDriverClient{
		client:         c,
		portDriverName: info.PortDriver.Driver,
	}
	if info.PortDriver.DisallowLoopbackChildIP {
		// i.e., port-driver="slirp4netns"
		// The driver cannot map to loopback inside the child namespace, so
		// remember the network driver's child IP to use in its place.
		if info.NetworkDriver.ChildIP == nil {
			return nil, fmt.Errorf("RootlessKit port driver (%q) does not allow loopback child IP, but network driver (%q) has no non-loopback IP",
				info.PortDriver.Driver, info.NetworkDriver.Driver)
		}
		childIP, ok := netip.AddrFromSlice(info.NetworkDriver.ChildIP)
		if !ok {
			return nil, fmt.Errorf("unable to use child IP %s from network driver (%q)",
				info.NetworkDriver.ChildIP, info.NetworkDriver.Driver)
		}
		pdc.childIP = childIP
	}
	// Record the set of protocols the port driver supports (used by AddPort).
	pdc.protos = make(map[string]struct{}, len(info.PortDriver.Protos))
	for _, p := range info.PortDriver.Protos {
		pdc.protos[p] = struct{}{}
	}
	return pdc, nil
}
// ChildHostIP returns the address that must be used in the child network
// namespace in place of hostIP, a host IP address. In particular, port
// mappings from host IP addresses, and DNAT rules, must use this child
// address in place of the real host address.
func (c *PortDriverClient) ChildHostIP(hostIP netip.Addr) netip.Addr {
	switch {
	case c == nil:
		// No port driver: host addresses map to themselves.
		return hostIP
	case c.childIP.IsValid():
		// The driver disallows loopback child IPs; use the namespace address.
		return c.childIP
	case hostIP.Is6():
		return netip.IPv6Loopback()
	default:
		return netip.MustParseAddr("127.0.0.1")
	}
}
// ProtocolUnsupportedError is returned when apiProto is not supported by portDriverName.
type ProtocolUnsupportedError struct {
	apiProto       string // protocol as passed to the RootlessKit API, e.g. "tcp6"
	portDriverName string // name of the port driver that rejected it
}

// Error implements the error interface.
func (e *ProtocolUnsupportedError) Error() string {
	return fmt.Sprintf("protocol %q is not supported by the RootlessKit port driver %q", e.apiProto, e.portDriverName)
}
// AddPort makes a request to RootlessKit asking it to set up a port
// mapping between a host IP address and a child host IP address. The
// returned function removes the mapping again.
//
// AddPort may return [ProtocolUnsupportedError].
func (c *PortDriverClient) AddPort(
	ctx context.Context,
	proto string,
	hostIP netip.Addr,
	childIP netip.Addr,
	hostPort int,
) (func() error, error) {
	if c == nil {
		// No port driver; nothing to set up, nothing to undo.
		return func() error { return nil }, nil
	}

	// proto is like "tcp", but we need to convert it to "tcp4" or "tcp6" explicitly
	// for libnetwork >= 20201216
	//
	// See https://github.com/moby/libnetwork/pull/2604/files#diff-8fa48beed55dd033bf8e4f8c40b31cf69d0b2cc5d4bb53cde8594670ea6c938aR20
	// See also https://github.com/rootless-containers/rootlesskit/issues/231
	apiProto := proto
	if !strings.HasSuffix(apiProto, "4") && !strings.HasSuffix(apiProto, "6") {
		if hostIP.Is6() {
			apiProto += "6"
		} else {
			apiProto += "4"
		}
	}

	if _, supported := c.protos[apiProto]; !supported {
		// This happens when apiProto="tcp6", portDriverName="slirp4netns",
		// because "slirp4netns" port driver does not support listening on IPv6 yet.
		//
		// Note that "slirp4netns" port driver is not used by default,
		// even when network driver is set to "slirp4netns".
		//
		// Most users are using "builtin" port driver and will not see this warning.
		return nil, &ProtocolUnsupportedError{
			apiProto:       apiProto,
			portDriverName: c.portDriverName,
		}
	}

	pm := c.client.PortManager()
	st, err := pm.AddPort(ctx, port.Spec{
		Proto:      apiProto,
		ParentIP:   hostIP.String(),
		ParentPort: hostPort,
		ChildIP:    childIP.String(),
		ChildPort:  hostPort,
	})
	if err != nil {
		return nil, fmt.Errorf("error while calling RootlessKit PortManager.AddPort(): %w", err)
	}

	// The cleanup closure survives cancellation of ctx so the mapping can
	// still be removed during teardown.
	cleanup := func() error {
		if dErr := pm.RemovePort(context.WithoutCancel(ctx), st.ID); dErr != nil {
			return fmt.Errorf("error while calling RootlessKit PortManager.RemovePort(): %w", dErr)
		}
		return nil
	}
	return cleanup, nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package setmatrix
import (
"sync"
mapset "github.com/deckarep/golang-set/v2"
)
// SetMatrix is a map of Sets.
// The zero value is an empty set matrix ready to use.
//
// SetMatrix values are safe for concurrent use.
type SetMatrix[K, V comparable] struct {
	// matrix maps each key to its set of values. It is created lazily on the
	// first Insert.
	matrix map[K]mapset.Set[V]
	// mu serializes all access to matrix and to the (thread-unsafe) sets it
	// holds.
	mu sync.Mutex
}
// Get returns the members of the set for a specific key as a slice.
func (s *SetMatrix[K, V]) Get(key K) ([]V, bool) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if set, ok := s.matrix[key]; ok {
		return set.ToSlice(), true
	}
	return nil, false
}
// Contains is used to verify if an element is in a set for a specific key.
func (s *SetMatrix[K, V]) Contains(key K, value V) (containsElement, setExists bool) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if set, ok := s.matrix[key]; ok {
		return set.Contains(value), true
	}
	return false, false
}
// Insert inserts the value in the set of a key and returns whether the value is
// inserted (was not already in the set) and the number of elements in the set.
func (s *SetMatrix[K, V]) Insert(key K, value V) (inserted bool, cardinality int) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if set, ok := s.matrix[key]; ok {
		return set.Add(value), set.Cardinality()
	}
	// First value for this key: lazily create the matrix and the key's set.
	if s.matrix == nil {
		s.matrix = make(map[K]mapset.Set[V])
	}
	s.matrix[key] = mapset.NewThreadUnsafeSet(value)
	return true, 1
}
// Remove removes the value in the set for a specific key.
func (s *SetMatrix[K, V]) Remove(key K, value V) (removed bool, cardinality int) {
	s.mu.Lock()
	defer s.mu.Unlock()
	set, ok := s.matrix[key]
	if !ok {
		return false, 0
	}
	removed = set.Contains(value)
	if removed {
		set.Remove(value)
		if set.Cardinality() == 0 {
			// If the set is empty remove it from the matrix
			delete(s.matrix, key)
		}
	}
	return removed, set.Cardinality()
}
// Cardinality returns the number of elements in the set for a key.
func (s *SetMatrix[K, V]) Cardinality(key K) (cardinality int, ok bool) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if set, exists := s.matrix[key]; exists {
		return set.Cardinality(), true
	}
	return 0, false
}
// String returns the string version of the set.
// The empty string is returned if there is no set for key.
func (s *SetMatrix[K, V]) String(key K) (v string, ok bool) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if set, exists := s.matrix[key]; exists {
		return set.String(), true
	}
	return "", false
}
// Keys returns all the keys in the map.
func (s *SetMatrix[K, V]) Keys() []K {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]K, 0, len(s.matrix))
	for key := range s.matrix {
		out = append(out, key)
	}
	return out
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package defaultipam
import (
"context"
"net/netip"
"slices"
"sync"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/internal/netiputil"
"github.com/docker/docker/daemon/libnetwork/ipamapi"
"github.com/docker/docker/daemon/libnetwork/ipamutils"
"github.com/docker/docker/daemon/libnetwork/ipbits"
"github.com/docker/docker/daemon/libnetwork/types"
)
// addrSpace contains the pool configurations for the address space
type addrSpace struct {
	// Ordered list of allocated subnets. This field is used for dynamic subnet
	// allocations.
	allocated []netip.Prefix
	// Allocated subnets, indexed by their prefix. Values track address
	// allocations.
	subnets map[netip.Prefix]*PoolData

	// predefined pools for the address space
	predefined []*ipamutils.NetworkToSplit

	// mu protects all of the fields above.
	mu sync.Mutex
}
// newAddrSpace builds an address space from the given predefined networks.
// The input is sorted by prefix, and any prefix that is overlapped by an
// earlier (shorter) prefix is discarded in place.
func newAddrSpace(predefined []*ipamutils.NetworkToSplit) (*addrSpace, error) {
	slices.SortFunc(predefined, func(a, b *ipamutils.NetworkToSplit) int {
		return netiputil.PrefixCompare(a.Base, b.Base)
	})

	// We need to discard longer overlapping prefixes (sorted after the shorter
	// one), otherwise the dynamic allocator might consider a predefined
	// network is fully overlapped, go to the next one, which is a subnet of
	// the previous, and allocate from it.
	j := 0 // index of the last element kept so far
	for i := 1; i < len(predefined); i++ {
		if predefined[j].Overlaps(predefined[i].Base) {
			continue
		}
		j++
		predefined[j] = predefined[i]
	}
	// Convert j from "index of last kept element" to "count of kept
	// elements"; no-op when the input was empty.
	if len(predefined) > j {
		j++
	}
	clear(predefined[j:]) // zero the dropped tail so its elements can be GC'd

	return &addrSpace{
		subnets:    map[netip.Prefix]*PoolData{},
		predefined: predefined[:j],
	}, nil
}
// allocateSubnet makes a static allocation for subnets 'nw' and 'sub'.
//
// This method is safe for concurrent use.
func (aSpace *addrSpace) allocateSubnet(nw, sub netip.Prefix) error {
	aSpace.mu.Lock()
	defer aSpace.mu.Unlock()

	// Reject an exact duplicate: the same parent with no subpool requested,
	// or the same (parent, subpool) pair. allocateSubnet is called when there
	// is a request for a pool/subpool and must not allow overlapping pools.
	if pool, ok := aSpace.subnets[nw]; ok {
		if sub == (netip.Prefix{}) {
			return ipamapi.ErrPoolOverlap
		}
		if _, childExists := pool.children[sub]; childExists {
			return ipamapi.ErrPoolOverlap
		}
	}
	return aSpace.allocateSubnetL(nw, sub)
}
// allocateSubnetL takes a 'nw' parent prefix and a 'sub' prefix. These are
// '--subnet' and '--ip-range' on the CLI.
//
// If 'sub' prefix is specified, we don't check if 'parent' overlaps with
// existing allocations. However, if no 'sub' prefix is specified, we do check
// for overlaps. This behavior is weird and leads to the inconsistencies
// documented in https://github.com/moby/moby/issues/46756.
func (aSpace *addrSpace) allocateSubnetL(nw, sub netip.Prefix) error {
	// If master pool, check for overlap
	if sub == (netip.Prefix{}) {
		if aSpace.overlaps(nw) {
			return ipamapi.ErrPoolOverlap
		}
		return aSpace.allocatePool(nw)
	}

	// Look for parent pool, creating it implicitly if it doesn't exist yet.
	// (Single map lookup instead of the previous lookup-per-use.)
	p, ok := aSpace.subnets[nw]
	if !ok {
		if err := aSpace.allocatePool(nw); err != nil {
			return err
		}
		p = aSpace.subnets[nw]
		// The parent was created only to host this subpool; release it
		// automatically once its last child is gone.
		p.autoRelease = true
	}
	p.children[sub] = struct{}{}
	return nil
}
// overlaps reports whether nw contains any IP addresses in common with any of
// the existing subnets in this address space.
func (aSpace *addrSpace) overlaps(nw netip.Prefix) bool {
	// netip.Prefix.Overlaps is symmetric, so the method value can be used
	// directly as the predicate.
	return slices.ContainsFunc(aSpace.allocated, nw.Overlaps)
}
// allocatePool records 'nw' as allocated: it is inserted at its sorted
// position in the allocated list and given fresh pool data.
func (aSpace *addrSpace) allocatePool(nw netip.Prefix) error {
	i, _ := slices.BinarySearchFunc(aSpace.allocated, nw, netiputil.PrefixCompare)
	aSpace.allocated = slices.Insert(aSpace.allocated, i, nw)
	aSpace.subnets[nw] = newPoolData(nw)
	return nil
}
// allocatePredefinedPool dynamically allocates a subnet that doesn't overlap
// with existing allocations and 'reserved' prefixes.
//
// This method is safe for concurrent use.
func (aSpace *addrSpace) allocatePredefinedPool(reserved []netip.Prefix) (netip.Prefix, error) {
	aSpace.mu.Lock()
	defer aSpace.mu.Unlock()

	var pdfID int          // index of the predefined network currently considered
	var partialOverlap bool // whether the current predefined network is partially overlapped
	var prevAlloc netip.Prefix

	// Walk the union of 'aSpace.allocated' and 'reserved' in sorted order.
	it := newMergeIter(aSpace.allocated, reserved, netiputil.PrefixCompare)

	// makeAlloc records 'subnet' as allocated and returns it.
	makeAlloc := func(subnet netip.Prefix) netip.Prefix {
		// it.ia tracks the position of the mergeIter within aSpace.allocated.
		aSpace.allocated = slices.Insert(aSpace.allocated, it.ia, subnet)
		aSpace.subnets[subnet] = newPoolData(subnet)
		return subnet
	}

	for {
		allocated := it.Get()
		if allocated == (netip.Prefix{}) {
			// We reached the end of both 'aSpace.allocated' and 'reserved'.
			break
		}

		if pdfID >= len(aSpace.predefined) {
			return netip.Prefix{}, ipamapi.ErrNoMoreSubnets
		}
		pdf := aSpace.predefined[pdfID]

		if allocated.Overlaps(pdf.Base) {
			if allocated.Bits() <= pdf.Base.Bits() {
				// The current 'allocated' prefix is bigger than the 'pdf'
				// network, thus the block is fully overlapped.
				partialOverlap = false
				prevAlloc = netip.Prefix{}
				pdfID++
				continue
			}

			// If no previous 'allocated' was found to partially overlap 'pdf',
			// we need to test whether there's enough space available at the
			// beginning of 'pdf'.
			if !partialOverlap && ipbits.SubnetsBetween(pdf.FirstPrefix().Addr(), allocated.Addr(), pdf.Size) >= 1 {
				// Okay, so there's at least a whole subnet available between
				// the start of 'pdf' and 'allocated'.
				next := pdf.FirstPrefix()
				return makeAlloc(next), nil
			}

			// If the network 'pdf' was already found to be partially
			// overlapped, we need to test whether there's enough space between
			// the end of 'prevAlloc' and current 'allocated'.
			afterPrev := netiputil.PrefixAfter(prevAlloc, pdf.Size)
			if partialOverlap && ipbits.SubnetsBetween(afterPrev.Addr(), allocated.Addr(), pdf.Size) >= 1 {
				// Okay, so there's at least a whole subnet available after
				// 'prevAlloc' and before 'allocated'.
				return makeAlloc(afterPrev), nil
			}

			it.Inc()
			if netiputil.LastAddr(allocated) == netiputil.LastAddr(pdf.Base) {
				// The last address of the current 'allocated' prefix is the
				// same as the last address of the 'pdf' network, it's fully
				// overlapped.
				partialOverlap = false
				prevAlloc = netip.Prefix{}
				pdfID++
				continue
			}

			// This 'pdf' network is partially overlapped.
			partialOverlap = true
			prevAlloc = allocated
			continue
		}

		// Okay, so previous 'allocated' overlapped and current doesn't. Now
		// the question is: is there enough space left between previous
		// 'allocated' and the end of the 'pdf' network?
		if partialOverlap {
			partialOverlap = false

			if next := netiputil.PrefixAfter(prevAlloc, pdf.Size); pdf.Overlaps(next) {
				return makeAlloc(next), nil
			}

			// No luck, PrefixAfter yielded an invalid prefix. There's not
			// enough space left to subnet it once more.
			pdfID++

			// 'it' is not incremented here, we need to re-test the current
			// 'allocated' against the next 'pdf' network.
			continue
		}

		// If the network 'pdf' doesn't overlap and is sorted before the
		// current 'allocated', we found the right spot.
		if pdf.Base.Addr().Less(allocated.Addr()) {
			next := netip.PrefixFrom(pdf.Base.Addr(), pdf.Size)
			return makeAlloc(next), nil
		}

		it.Inc()
		prevAlloc = allocated
	}

	if pdfID >= len(aSpace.predefined) {
		return netip.Prefix{}, ipamapi.ErrNoMoreSubnets
	}

	// We reached the end of 'allocated', but not the end of predefined
	// networks. Let's try two more times (once on the current 'pdf', and once
	// on the next network if any).
	if partialOverlap {
		pdf := aSpace.predefined[pdfID]

		if next := netiputil.PrefixAfter(prevAlloc, pdf.Size); pdf.Overlaps(next) {
			return makeAlloc(next), nil
		}

		// No luck -- PrefixAfter yielded an invalid prefix. There's not enough
		// space left.
		pdfID++
	}

	// One last chance. Here we don't increment pdfID since the last iteration
	// on 'it' found either:
	//
	// - A full overlap, and incremented 'pdfID'.
	// - A partial overlap, and the previous 'if' incremented 'pdfID'.
	// - The current 'pdfID' comes after the last 'allocated' -- it's not
	//   overlapped at all.
	//
	// Hence, we're sure 'pdfID' has never been subnetted yet.
	if pdfID < len(aSpace.predefined) {
		pdf := aSpace.predefined[pdfID]

		next := pdf.FirstPrefix()
		return makeAlloc(next), nil
	}

	return netip.Prefix{}, ipamapi.ErrNoMoreSubnets
}
// releaseSubnet deallocates prefixes nw and sub. It returns an error if no
// matching allocations could be found.
//
// This method is safe for concurrent use.
func (aSpace *addrSpace) releaseSubnet(nw, sub netip.Prefix) error {
	aSpace.mu.Lock()
	defer aSpace.mu.Unlock()

	p, ok := aSpace.subnets[nw]
	if !ok {
		return ipamapi.ErrBadPool
	}

	if sub == (netip.Prefix{}) {
		// Releasing the parent pool itself: flag it so it is deallocated as
		// soon as it has no remaining children.
		p.autoRelease = true
	} else {
		if _, ok := p.children[sub]; !ok {
			return ipamapi.ErrBadPool
		}
		delete(p.children, sub)
	}

	if len(p.children) == 0 && p.autoRelease {
		aSpace.deallocate(nw)
	}
	return nil
}
// deallocate removes 'nw' from the list of allocations.
func (aSpace *addrSpace) deallocate(nw netip.Prefix) {
	i, found := slices.BinarySearchFunc(aSpace.allocated, nw, netiputil.PrefixCompare)
	if !found {
		return
	}
	aSpace.allocated = slices.Delete(aSpace.allocated, i, i+1)
	delete(aSpace.subnets, nw)
}
// requestAddress allocates an address from pool nw (restricted to subpool sub
// when sub is non-zero). If prefAddress is set, that specific address is
// requested. This method is safe for concurrent use.
func (aSpace *addrSpace) requestAddress(nw, sub netip.Prefix, prefAddress netip.Addr, opts map[string]string) (netip.Addr, error) {
	aSpace.mu.Lock()
	defer aSpace.mu.Unlock()

	pool, ok := aSpace.subnets[nw]
	if !ok {
		return netip.Addr{}, types.NotFoundErrorf("cannot find address pool for poolID:%v/%v", nw, sub)
	}
	if prefAddress != (netip.Addr{}) && !nw.Contains(prefAddress) {
		return netip.Addr{}, ipamapi.ErrIPOutOfRange
	}
	if sub != (netip.Prefix{}) {
		if _, ok := pool.children[sub]; !ok {
			return netip.Addr{}, types.NotFoundErrorf("cannot find address pool for poolID:%v/%v", nw, sub)
		}
	}

	// In order to request for a serial ip address allocation, callers can pass in the option to request
	// IP allocation serially or first available IP in the subnet
	serial := opts[ipamapi.AllocSerialPrefix] == "true"
	return getAddress(nw, pool.addrs, prefAddress, sub, serial)
}
// releaseAddress returns 'address' to pool nw (validated against subpool sub
// when sub is non-zero). This method is safe for concurrent use.
func (aSpace *addrSpace) releaseAddress(nw, sub netip.Prefix, address netip.Addr) error {
	aSpace.mu.Lock()
	defer aSpace.mu.Unlock()

	pool, ok := aSpace.subnets[nw]
	if !ok {
		return types.NotFoundErrorf("cannot find address pool for %v/%v", nw, sub)
	}
	if sub != (netip.Prefix{}) {
		if _, ok := pool.children[sub]; !ok {
			return types.NotFoundErrorf("cannot find address pool for poolID:%v/%v", nw, sub)
		}
	}
	if !address.IsValid() {
		return types.InvalidParameterErrorf("invalid address")
	}
	if !nw.Contains(address) {
		return ipamapi.ErrIPOutOfRange
	}

	defer log.G(context.TODO()).Debugf("Released address Address:%v Sequence:%s", address, pool.addrs)
	return pool.addrs.Remove(address)
}
package defaultipam
import (
"context"
"errors"
"fmt"
"net"
"net/netip"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/internal/addrset"
"github.com/docker/docker/daemon/libnetwork/internal/netiputil"
"github.com/docker/docker/daemon/libnetwork/ipamapi"
"github.com/docker/docker/daemon/libnetwork/ipamutils"
"github.com/docker/docker/daemon/libnetwork/types"
)
const (
	// DriverName is the name of the built-in default IPAM driver.
	DriverName = "default"

	// localAddressSpace and globalAddressSpace are the names of the two
	// address spaces served by this driver; see Allocator.getAddrSpace.
	localAddressSpace  = "LocalDefault"
	globalAddressSpace = "GlobalDefault"
)
// Register registers the default ipam driver with libnetwork. It takes
// two optional address pools respectively containing the list of user-defined
// address pools for 'local' and 'global' address spaces.
func Register(ic ipamapi.Registerer, lAddrPools, gAddrPools []*ipamutils.NetworkToSplit) error {
	// Fall back to the built-in global-scope defaults when no global pools
	// were supplied.
	if len(gAddrPools) == 0 {
		gAddrPools = ipamutils.GetGlobalScopeDefaultNetworks()
	}
	alloc, err := NewAllocator(lAddrPools, gAddrPools)
	if err != nil {
		return err
	}
	caps := &ipamapi.Capability{RequiresRequestReplay: true}
	return ic.RegisterIpamDriverWithCapabilities(DriverName, alloc, caps)
}
// Allocator provides per address space ipv4/ipv6 bookkeeping
type Allocator struct {
	// The address spaces, one per (locality, IP family) pair; see
	// getAddrSpace for the mapping from address-space name to field.
	local4, local6, global4, global6 *addrSpace
}
// NewAllocator returns an instance of libnetwork ipam
func NewAllocator(lcAs, glAs []*ipamutils.NetworkToSplit) (*Allocator, error) {
	// Split each configured list into an IPv4 and an IPv6 half.
	lcAs4, lcAs6, err := splitByIPFamily(lcAs)
	if err != nil {
		return nil, fmt.Errorf("could not construct local address space: %w", err)
	}
	glAs4, glAs6, err := splitByIPFamily(glAs)
	if err != nil {
		return nil, fmt.Errorf("could not construct global address space: %w", err)
	}

	a := &Allocator{}
	if a.local4, err = newAddrSpace(lcAs4); err != nil {
		return nil, fmt.Errorf("could not construct local v4 address space: %w", err)
	}
	if a.local6, err = newAddrSpace(lcAs6); err != nil {
		return nil, fmt.Errorf("could not construct local v6 address space: %w", err)
	}
	if a.global4, err = newAddrSpace(glAs4); err != nil {
		return nil, fmt.Errorf("could not construct global v4 address space: %w", err)
	}
	if a.global6, err = newAddrSpace(glAs6); err != nil {
		return nil, fmt.Errorf("could not construct global v6 address space: %w", err)
	}
	return a, nil
}
// splitByIPFamily validates and canonicalizes the given networks (each Base
// prefix is unmapped and masked in place), then splits them into an IPv4 list
// and an IPv6 list. It returns an error when a network has an invalid base
// prefix, a zero target size, or a base prefix longer than the target size.
func splitByIPFamily(s []*ipamutils.NetworkToSplit) ([]*ipamutils.NetworkToSplit, []*ipamutils.NetworkToSplit, error) {
	var v4, v6 []*ipamutils.NetworkToSplit
	for i, n := range s {
		if !n.Base.IsValid() || n.Size == 0 {
			// Return nil slices on error: the values are meaningless when
			// err is non-nil (idiomatic Go; previously empty slices were
			// allocated for no benefit).
			return nil, nil, fmt.Errorf("network at index %d (%v) is not in canonical form", i, n)
		}
		if n.Base.Bits() > n.Size {
			return nil, nil, fmt.Errorf("network at index %d (%v) has a smaller prefix (/%d) than the target size of that pool (/%d)", i, n, n.Base.Bits(), n.Size)
		}

		// Canonicalize: strip any IPv4-in-IPv6 mapping and mask to the
		// prefix length.
		n.Base, _ = n.Base.Addr().Unmap().Prefix(n.Base.Bits())

		if n.Base.Addr().Is4() {
			v4 = append(v4, n)
		} else {
			v6 = append(v6, n)
		}
	}
	return v4, v6, nil
}
// GetDefaultAddressSpaces returns the local and global default address spaces
func (a *Allocator) GetDefaultAddressSpaces() (string, string, error) {
	// The error is always nil; the signature matches the ipamapi contract.
	return localAddressSpace, globalAddressSpace, nil
}
// RequestPool returns an address pool along with its unique id.
// addressSpace must be a valid address space name and must not be the empty string.
// If requestedPool is the empty string then the default predefined pool for addressSpace will be used, otherwise pool must be a valid IP address and length in CIDR notation.
// If requestedSubPool is not empty, it must be a valid IP address and length in CIDR notation which is a sub-range of requestedPool.
// requestedSubPool must be empty if requestedPool is empty.
func (a *Allocator) RequestPool(req ipamapi.PoolRequest) (ipamapi.AllocatedPool, error) {
	log.G(context.TODO()).Debugf("RequestPool: %+v", req)

	// parseErr wraps validation failures with the full request context.
	parseErr := func(err error) error {
		return types.InternalErrorf("failed to parse pool request for address space %q pool %q subpool %q: %v", req.AddressSpace, req.Pool, req.SubPool, err)
	}

	if req.AddressSpace == "" {
		return ipamapi.AllocatedPool{}, parseErr(ipamapi.ErrInvalidAddressSpace)
	}
	aSpace, err := a.getAddrSpace(req.AddressSpace, req.V6)
	if err != nil {
		return ipamapi.AllocatedPool{}, err
	}
	if req.Pool == "" && req.SubPool != "" {
		return ipamapi.AllocatedPool{}, parseErr(ipamapi.ErrInvalidSubPool)
	}

	k := PoolID{AddressSpace: req.AddressSpace}
	if req.Pool == "" {
		// No explicit pool requested: dynamically allocate one from the
		// predefined pools, avoiding the prefixes in req.Exclude.
		if k.Subnet, err = aSpace.allocatePredefinedPool(req.Exclude); err != nil {
			return ipamapi.AllocatedPool{}, err
		}
		return ipamapi.AllocatedPool{PoolID: k.String(), Pool: k.Subnet}, nil
	}

	if k.Subnet, err = netip.ParsePrefix(req.Pool); err != nil {
		return ipamapi.AllocatedPool{}, parseErr(ipamapi.ErrInvalidPool)
	}

	if req.SubPool != "" {
		if k.ChildSubnet, err = netip.ParsePrefix(req.SubPool); err != nil {
			return ipamapi.AllocatedPool{}, types.InternalErrorf("invalid pool request: %v", ipamapi.ErrInvalidSubPool)
		}
	}

	// This is a new non-master pool (subPool)
	// Pool and subpool must belong to the same IP family.
	if k.Subnet.IsValid() && k.ChildSubnet.IsValid() && k.Subnet.Addr().BitLen() != k.ChildSubnet.Addr().BitLen() {
		return ipamapi.AllocatedPool{}, types.InvalidParameterErrorf("pool and subpool are of incompatible address families")
	}

	k.Subnet, k.ChildSubnet = k.Subnet.Masked(), k.ChildSubnet.Masked()

	// Prior to https://github.com/moby/moby/pull/44968, libnetwork would happily accept a ChildSubnet with a bigger
	// mask than its parent subnet. In such case, it was producing IP addresses based on the parent subnet, and the
	// child subnet was not allocated from the address pool. Following condition take care of restoring this behavior
	// for networks created before upgrading to v24.0.
	if k.ChildSubnet.IsValid() && k.ChildSubnet.Bits() < k.Subnet.Bits() {
		k.ChildSubnet = k.Subnet
	}

	err = aSpace.allocateSubnet(k.Subnet, k.ChildSubnet)
	if err != nil {
		return ipamapi.AllocatedPool{}, types.ForbiddenErrorf("invalid pool request: %v", err)
	}

	return ipamapi.AllocatedPool{PoolID: k.String(), Pool: k.Subnet}, nil
}
// ReleasePool releases the address pool identified by the passed id
func (a *Allocator) ReleasePool(poolID string) error {
	log.G(context.TODO()).Debugf("ReleasePool(%s)", poolID)
	pID, err := PoolIDFromString(poolID)
	if err != nil {
		return types.InvalidParameterErrorf("invalid pool id: %s", poolID)
	}
	space, err := a.getAddrSpace(pID.AddressSpace, pID.Is6())
	if err != nil {
		return err
	}
	return space.releaseSubnet(pID.Subnet, pID.ChildSubnet)
}
// Given the address space, returns the local or global PoolConfig based on whether the
// address space is local or global. AddressSpace locality is registered with IPAM out of band.
func (a *Allocator) getAddrSpace(as string, v6 bool) (*addrSpace, error) {
	switch {
	case as == localAddressSpace && v6:
		return a.local6, nil
	case as == localAddressSpace:
		return a.local4, nil
	case as == globalAddressSpace && v6:
		return a.global6, nil
	case as == globalAddressSpace:
		return a.global4, nil
	default:
		return nil, types.InvalidParameterErrorf("cannot find address space %s", as)
	}
}
// newPoolData creates the address-tracking state for a pool, pre-reserving
// the addresses that must never be handed out.
func newPoolData(pool netip.Prefix) *PoolData {
	set := addrset.New(pool)
	hostBits := pool.Addr().BitLen() - pool.Bits()
	isV4 := pool.Addr().Is4()

	// Reserve the first address in the range for the:
	// - IPv4 network address
	//   - Except in a /31 point-to-point link, https://datatracker.ietf.org/doc/html/rfc3021
	// - IPv6 Subnet-Router anycast address, https://datatracker.ietf.org/doc/html/rfc4291#section-2.6.1
	if !isV4 || hostBits > 1 {
		set.Add(pool.Addr())
	}

	// For IPv4, reserve the broadcast address.
	// - Except in a /31 point-to-point link, https://datatracker.ietf.org/doc/html/rfc3021
	if isV4 && hostBits > 1 {
		set.Add(netiputil.LastAddr(pool))
	}

	return &PoolData{addrs: set, children: map[netip.Prefix]struct{}{}}
}
// RequestAddress returns an address from the specified pool ID
func (a *Allocator) RequestAddress(poolID string, prefAddress net.IP, opts map[string]string) (*net.IPNet, map[string]string, error) {
	log.G(context.TODO()).Debugf("RequestAddress(%s, %v, %v)", poolID, prefAddress, opts)
	pID, err := PoolIDFromString(poolID)
	if err != nil {
		return nil, nil, types.InvalidParameterErrorf("invalid pool id: %s", poolID)
	}
	space, err := a.getAddrSpace(pID.AddressSpace, pID.Is6())
	if err != nil {
		return nil, nil, err
	}

	// Convert the optional preferred address to netip form.
	var pref netip.Addr
	if prefAddress != nil {
		ok := false
		if pref, ok = netip.AddrFromSlice(prefAddress); !ok {
			return nil, nil, types.InvalidParameterErrorf("invalid preferred address: %v", prefAddress)
		}
	}

	addr, err := space.requestAddress(pID.Subnet, pID.ChildSubnet, pref.Unmap(), opts)
	if err != nil {
		return nil, nil, err
	}
	mask := net.CIDRMask(pID.Subnet.Bits(), pID.Subnet.Addr().BitLen())
	return &net.IPNet{IP: addr.AsSlice(), Mask: mask}, nil, nil
}
// ReleaseAddress releases the address from the specified pool ID
func (a *Allocator) ReleaseAddress(poolID string, address net.IP) error {
	log.G(context.TODO()).Debugf("ReleaseAddress(%s, %v)", poolID, address)
	pID, err := PoolIDFromString(poolID)
	if err != nil {
		return types.InvalidParameterErrorf("invalid pool id: %s", poolID)
	}
	space, err := a.getAddrSpace(pID.AddressSpace, pID.Is6())
	if err != nil {
		return err
	}
	addr, ok := netip.AddrFromSlice(address)
	if !ok {
		return types.InvalidParameterErrorf("invalid address: %v", address)
	}
	return space.releaseAddress(pID.Subnet, pID.ChildSubnet, addr.Unmap())
}
// getAddress allocates an address from addrSet: the preferred address when
// one is given, otherwise any free address in the range 'ipr' (when set and
// distinct from 'base'), otherwise any free address in the whole pool.
// addrset errors are translated to ipamapi sentinel errors.
func getAddress(base netip.Prefix, addrSet *addrset.AddrSet, prefAddress netip.Addr, ipr netip.Prefix, serial bool) (netip.Addr, error) {
	log.G(context.TODO()).Debugf("Request address PoolID:%v %s Serial:%v PrefAddress:%v ", base, addrSet, serial, prefAddress)

	var addr netip.Addr
	var err error
	switch {
	case prefAddress.IsValid():
		if err = addrSet.Add(prefAddress); err == nil {
			addr = prefAddress
		}
	case ipr.IsValid() && ipr != base:
		addr, err = addrSet.AddAnyInRange(ipr, serial)
	default:
		addr, err = addrSet.AddAny(serial)
	}

	switch {
	case err == nil:
		return addr, nil
	case errors.Is(err, addrset.ErrAllocated):
		return netip.Addr{}, ipamapi.ErrIPAlreadyAllocated
	case errors.Is(err, addrset.ErrNotAvailable):
		return netip.Addr{}, ipamapi.ErrNoAvailableIPs
	default:
		return netip.Addr{}, err
	}
}
// IsBuiltIn returns true for builtin drivers
func (a *Allocator) IsBuiltIn() bool {
	// The default IPAM driver always ships with libnetwork; it is never a
	// remote plugin.
	return true
}
package defaultipam
import (
"fmt"
"net/netip"
"strings"
"github.com/docker/docker/daemon/libnetwork/internal/addrset"
"github.com/docker/docker/daemon/libnetwork/types"
)
// PoolID is the pointer to the configured pools in each address space
type PoolID struct {
	// AddressSpace is the name of the address space the pool belongs to
	// (e.g. "LocalDefault" or "GlobalDefault").
	AddressSpace string
	// SubnetKey identifies the subnet (and optional child subnet) within
	// the address space.
	SubnetKey
}
// PoolData contains the configured pool data
type PoolData struct {
	// addrs tracks which addresses of the pool are currently allocated.
	addrs *addrset.AddrSet
	// children is the set of sub-pools carved out of this pool.
	children map[netip.Prefix]struct{}

	// Whether to implicitly release the pool once it no longer has any children.
	autoRelease bool
}
// SubnetKey is the composite key to an address pool within an address space.
type SubnetKey struct {
	// Subnet is the pool's prefix; ChildSubnet, when non-zero, is a
	// sub-range of Subnet.
	Subnet, ChildSubnet netip.Prefix
}

// Is6 reports whether the key's subnet is an IPv6 prefix.
func (k SubnetKey) Is6() bool {
	return k.Subnet.Addr().Is6()
}
// PoolIDFromString creates a new PoolID and populates the SubnetKey object
// reading it from the given string.
func PoolIDFromString(str string) (pID PoolID, err error) {
	// badKey builds the uniform error for every malformed input.
	badKey := func() error {
		return types.InvalidParameterErrorf("invalid string form for subnetkey: %s", str)
	}

	if str == "" {
		return pID, badKey()
	}
	parts := strings.Split(str, "/")
	// Expected forms: "space/addr/bits" or "space/addr/bits/childaddr/childbits".
	if len(parts) != 3 && len(parts) != 5 {
		return pID, badKey()
	}
	pID.AddressSpace = parts[0]
	if pID.Subnet, err = netip.ParsePrefix(parts[1] + "/" + parts[2]); err != nil {
		return pID, badKey()
	}
	if len(parts) == 5 {
		if pID.ChildSubnet, err = netip.ParsePrefix(parts[3] + "/" + parts[4]); err != nil {
			return pID, badKey()
		}
	}
	return pID, nil
}
// String returns the string form of the SubnetKey object
func (s *PoolID) String() string {
	str := s.AddressSpace + "/" + s.Subnet.String()
	if s.ChildSubnet != (netip.Prefix{}) {
		str += "/" + s.ChildSubnet.String()
	}
	return str
}
// String returns the string form of the PoolData object
func (p *PoolData) String() string {
	// Only the child count is shown; the address set may be large.
	return fmt.Sprintf("PoolData[Children: %d]", len(p.children))
}
// mergeIter is used to iterate on both 'a' and 'b' at the same time while
// maintaining the total order that would arise if both were merged and then
// sorted. Both 'a' and 'b' have to be sorted beforehand.
type mergeIter struct {
	a, b   []netip.Prefix // the two sorted inputs
	ia, ib int            // current position within 'a' and 'b' respectively
	cmp    func(a, b netip.Prefix) int
	lastA  bool // whether the element currently returned by Get comes from 'a'
}
func newMergeIter(a, b []netip.Prefix, cmp func(a, b netip.Prefix) int) *mergeIter {
iter := &mergeIter{
a: a,
b: b,
cmp: cmp,
}
iter.lastA = iter.nextA()
return iter
}
func (it *mergeIter) Get() netip.Prefix {
if it.ia+it.ib >= len(it.a)+len(it.b) {
return netip.Prefix{}
}
if it.lastA {
return it.a[it.ia]
}
return it.b[it.ib]
}
func (it *mergeIter) Inc() {
if it.lastA {
it.ia++
} else {
it.ib++
}
it.lastA = it.nextA()
}
func (it *mergeIter) nextA() bool {
if it.ia < len(it.a) && it.ib < len(it.b) && it.cmp(it.a[it.ia], it.b[it.ib]) <= 0 {
return true
} else if it.ia < len(it.a) && it.ib >= len(it.b) {
return true
}
return false
}
package ipams
import (
"github.com/docker/docker/daemon/libnetwork/ipamapi"
"github.com/docker/docker/daemon/libnetwork/ipams/defaultipam"
"github.com/docker/docker/daemon/libnetwork/ipams/null"
remoteIpam "github.com/docker/docker/daemon/libnetwork/ipams/remote"
"github.com/docker/docker/daemon/libnetwork/ipams/windowsipam"
"github.com/docker/docker/daemon/libnetwork/ipamutils"
"github.com/docker/docker/pkg/plugingetter"
)
// Register registers all the builtin drivers (ie. default, windowsipam, null
// and remote). If 'pg' is nil, the remote driver won't be registered.
func Register(r ipamapi.Registerer, pg plugingetter.PluginGetter, lAddrPools, gAddrPools []*ipamutils.NetworkToSplit) error {
	if err := defaultipam.Register(r, lAddrPools, gAddrPools); err != nil {
		return err
	}
	// windowsipam and null share the same registration signature.
	for _, register := range []func(ipamapi.Registerer) error{
		windowsipam.Register,
		null.Register,
	} {
		if err := register(r); err != nil {
			return err
		}
	}
	if pg == nil {
		return nil
	}
	return remoteIpam.Register(r, pg)
}
// Package null implements the null ipam driver. Null ipam driver satisfies ipamapi contract,
// but does not effectively reserve/allocate any address pool or address
package null
import (
"net"
"net/netip"
"github.com/docker/docker/daemon/libnetwork/ipamapi"
"github.com/docker/docker/daemon/libnetwork/types"
)
const (
	// DriverName is the name of the built-in null ipam driver
	DriverName = "null"

	// defaultAddressSpace is the single address space the driver accepts.
	defaultAddressSpace = "null"
	// defaultPoolCIDR4 / defaultPoolID4 describe the catch-all IPv4 pool
	// returned for every IPv4 request.
	defaultPoolCIDR4 = "0.0.0.0/0"
	defaultPoolID4   = defaultAddressSpace + "/" + defaultPoolCIDR4
	// defaultPoolCIDR6 / defaultPoolID6 are the IPv6 equivalents.
	defaultPoolCIDR6 = "::/0"
	defaultPoolID6   = defaultAddressSpace + "/" + defaultPoolCIDR6
)
var (
	// defaultPool4 and defaultPool6 are the parsed forms of the catch-all
	// pool CIDRs above; MustParsePrefix cannot fail on these literals.
	defaultPool4 = netip.MustParsePrefix(defaultPoolCIDR4)
	defaultPool6 = netip.MustParsePrefix(defaultPoolCIDR6)
)
// allocator is the stateless null IPAM driver: it hands out the same
// catch-all pool to every request and performs no bookkeeping.
type allocator struct{}

// GetDefaultAddressSpaces returns the local and global default address
// spaces; the null driver uses a single space ("null") for both.
func (a *allocator) GetDefaultAddressSpaces() (string, string, error) {
	return defaultAddressSpace, defaultAddressSpace, nil
}
// RequestPool returns the driver's static catch-all pool ("0.0.0.0/0" or
// "::/0"). Specific pool or subpool requests are rejected, as is any address
// space other than "null".
func (a *allocator) RequestPool(req ipamapi.PoolRequest) (ipamapi.AllocatedPool, error) {
	switch {
	case req.AddressSpace != defaultAddressSpace:
		return ipamapi.AllocatedPool{}, types.InvalidParameterErrorf("unknown address space: %s", req.AddressSpace)
	case req.Pool != "":
		return ipamapi.AllocatedPool{}, types.InvalidParameterErrorf("null ipam driver does not handle specific address pool requests")
	case req.SubPool != "":
		return ipamapi.AllocatedPool{}, types.InvalidParameterErrorf("null ipam driver does not handle specific address subpool requests")
	}
	pool, id := defaultPool4, defaultPoolID4
	if req.V6 {
		pool, id = defaultPool6, defaultPoolID6
	}
	return ipamapi.AllocatedPool{
		PoolID: id,
		Pool:   pool,
	}, nil
}
// ReleasePool is a no-op: the null driver never reserves anything, so there
// is nothing to release. Unlike RequestAddress/ReleaseAddress, the poolID is
// not validated here.
func (a *allocator) ReleasePool(poolID string) error {
	return nil
}
// RequestAddress validates the pool ID but never actually allocates an
// address — it always returns (nil, nil, nil) for known pools.
func (a *allocator) RequestAddress(poolID string, ip net.IP, opts map[string]string) (*net.IPNet, map[string]string, error) {
	switch poolID {
	case defaultPoolID4, defaultPoolID6:
		return nil, nil, nil
	}
	return nil, nil, types.InvalidParameterErrorf("unknown pool id: %s", poolID)
}
// ReleaseAddress validates the pool ID; there is nothing to release since
// RequestAddress never hands out addresses.
func (a *allocator) ReleaseAddress(poolID string, ip net.IP) error {
	switch poolID {
	case defaultPoolID4, defaultPoolID6:
		return nil
	}
	return types.InvalidParameterErrorf("unknown pool id: %s", poolID)
}
// IsBuiltIn reports that the null driver is a built-in (non-plugin) driver.
func (a *allocator) IsBuiltIn() bool {
	return true
}
// Register registers the null ipam driver with r.
func Register(r ipamapi.Registerer) error {
	return r.RegisterIpamDriver(DriverName, &allocator{})
}
// Package api defines the data structure to be used in the request/response
// messages between libnetwork and the remote ipam plugin
package api
import "github.com/docker/docker/daemon/libnetwork/ipamapi"
// Response is the basic response structure used in all responses
type Response struct {
	Error string
}

// IsSuccess returns whether the plugin response is successful
func (r *Response) IsSuccess() bool {
	// A response is successful iff the plugin set no error string.
	return len(r.Error) == 0
}

// GetError returns the error from the response, if any.
func (r *Response) GetError() string {
	return r.Error
}
// GetCapabilityResponse is the response of GetCapability request
type GetCapabilityResponse struct {
	Response
	RequiresMACAddress    bool
	RequiresRequestReplay bool
}

// ToCapability converts the capability response into the internal ipam driver capability structure
func (capRes GetCapabilityResponse) ToCapability() *ipamapi.Capability {
	return &ipamapi.Capability{
		RequiresMACAddress:    capRes.RequiresMACAddress,
		RequiresRequestReplay: capRes.RequiresRequestReplay,
	}
}
// GetAddressSpacesResponse is the response to the "get default address spaces" request message
type GetAddressSpacesResponse struct {
	Response
	LocalDefaultAddressSpace  string
	GlobalDefaultAddressSpace string
}
// RequestPoolRequest represents the expected data in a "request address pool" request message
type RequestPoolRequest struct {
	AddressSpace string
	Pool         string
	SubPool      string
	Options      map[string]string
	V6           bool
}

// RequestPoolResponse represents the response message to a "request address pool" request
type RequestPoolResponse struct {
	Response
	PoolID string
	Pool   string // CIDR format
	Data   map[string]string
}
// ReleasePoolRequest represents the expected data in a "release address pool" request message
type ReleasePoolRequest struct {
	PoolID string
}

// ReleasePoolResponse represents the response message to a "release address pool" request
type ReleasePoolResponse struct {
	Response
}
// RequestAddressRequest represents the expected data in a "request address" request message
type RequestAddressRequest struct {
	PoolID  string
	Address string
	Options map[string]string
}

// RequestAddressResponse represents the expected data in the response message to a "request address" request
type RequestAddressResponse struct {
	Response
	Address string // in CIDR format
	Data    map[string]string
}
// ReleaseAddressRequest represents the expected data in a "release address" request message
type ReleaseAddressRequest struct {
	PoolID  string
	Address string
}

// ReleaseAddressResponse represents the response message to a "release address" request
type ReleaseAddressResponse struct {
	Response
}
package remote
import (
"context"
"fmt"
"net"
"net/netip"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/ipamapi"
"github.com/docker/docker/daemon/libnetwork/ipams/remote/api"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/pkg/plugingetter"
"github.com/docker/docker/pkg/plugins"
"github.com/pkg/errors"
)
// allocator is an IPAM driver backed by a remote plugin; every method is
// forwarded to the plugin over its transport client.
type allocator struct {
	// endpoint is the plugin transport used for all remote calls.
	endpoint *plugins.Client
	// name is the plugin (driver) name it was registered under.
	name string
}

// PluginResponse is the interface for the plugin request responses
type PluginResponse interface {
	IsSuccess() bool
	GetError() string
}
// newAllocator returns a remote IPAM driver that forwards all requests to
// the plugin reachable through client.
func newAllocator(name string, client *plugins.Client) ipamapi.Ipam {
	return &allocator{
		name:     name,
		endpoint: client,
	}
}
// Register registers a remote ipam when its plugin is activated.
func Register(cb ipamapi.Registerer, pg plugingetter.PluginGetter) error {
	// newPluginHandler registers one activated plugin as an ipam driver,
	// with capabilities when the plugin supports the capabilities call.
	newPluginHandler := func(name string, client *plugins.Client) {
		a := newAllocator(name, client)
		if cps, err := a.(*allocator).getCapabilities(); err == nil {
			if err := cb.RegisterIpamDriverWithCapabilities(name, a, cps); err != nil {
				log.G(context.TODO()).Errorf("error registering remote ipam driver %s due to %v", name, err)
			}
		} else {
			// Older plugins may not implement GetCapabilities; fall back to
			// registration without capabilities rather than failing.
			log.G(context.TODO()).Infof("remote ipam driver %s does not support capabilities", name)
			log.G(context.TODO()).Debug(err)
			if err := cb.RegisterIpamDriver(name, a); err != nil {
				log.G(context.TODO()).Errorf("error registering remote ipam driver %s due to %v", name, err)
			}
		}
	}

	// Unit test code is unaware of a true PluginStore. So we fall back to v1 plugins.
	handleFunc := plugins.Handle
	if pg != nil {
		handleFunc = pg.Handle
		// Eagerly register plugins that are already active.
		activePlugins := pg.GetAllManagedPluginsByCap(ipamapi.PluginEndpointType)
		for _, ap := range activePlugins {
			client, err := getPluginClient(ap)
			if err != nil {
				return err
			}
			newPluginHandler(ap.Name(), client)
		}
	}
	// Register the handler for plugins activated later.
	handleFunc(ipamapi.PluginEndpointType, newPluginHandler)

	return nil
}
// getPluginClient extracts (or constructs) an HTTP v1 plugin client from a
// CompatPlugin. It supports legacy plugins carrying their own client and
// address-based plugins speaking the v1 HTTP protocol; anything else is an
// error.
func getPluginClient(p plugingetter.CompatPlugin) (*plugins.Client, error) {
	// Legacy v1 plugins expose a ready-made client.
	if v1, ok := p.(plugingetter.PluginWithV1Client); ok {
		return v1.Client(), nil
	}

	pa, ok := p.(plugingetter.PluginAddr)
	if !ok {
		return nil, errors.Errorf("unknown plugin type %T", p)
	}

	if pa.Protocol() != plugins.ProtocolSchemeHTTPV1 {
		return nil, errors.Errorf("unsupported plugin protocol %s", pa.Protocol())
	}

	addr := pa.Addr()
	client, err := plugins.NewClientWithTimeout(addr.Network()+"://"+addr.String(), nil, pa.Timeout())
	if err != nil {
		return nil, errors.Wrap(err, "error creating plugin client")
	}
	return client, nil
}
// call invokes methodName on the remote plugin with arg, decoding the reply
// into retVal. Both transport failures and errors reported by the plugin in
// its response are surfaced as a non-nil error.
func (a *allocator) call(methodName string, arg interface{}, retVal PluginResponse) error {
	method := ipamapi.PluginEndpointType + "." + methodName
	if err := a.endpoint.Call(method, arg, retVal); err != nil {
		return err
	}
	if retVal.IsSuccess() {
		return nil
	}
	return fmt.Errorf("remote: %s", retVal.GetError())
}
// getCapabilities queries the remote plugin for its capabilities. An error
// here typically means the plugin predates the GetCapabilities call.
func (a *allocator) getCapabilities() (*ipamapi.Capability, error) {
	var res api.GetCapabilityResponse
	if err := a.call("GetCapabilities", nil, &res); err != nil {
		return nil, err
	}
	return res.ToCapability(), nil
}
// GetDefaultAddressSpaces returns the local and global default address spaces
func (a *allocator) GetDefaultAddressSpaces() (string, string, error) {
	res := &api.GetAddressSpacesResponse{}
	if err := a.call("GetDefaultAddressSpaces", nil, res); err != nil {
		return "", "", err
	}
	return res.LocalDefaultAddressSpace, res.GlobalDefaultAddressSpace, nil
}
// RequestPool requests an address pool in the specified address space.
//
// This is a bug-for-bug re-implementation of the logic originally found in
// requestPoolHelper prior to v27. See https://github.com/moby/moby/blob/faf84d7f0a1f2e6badff6f720a3e1e559c356fff/libnetwork/network.go#L1518-L1570
func (a *allocator) RequestPool(req ipamapi.PoolRequest) (ipamapi.AllocatedPool, error) {
	// tmpPoolLeases holds pools that overlapped req.Exclude. They are kept
	// leased for the duration of the loop so the plugin doesn't hand the
	// same pool straight back, then released on return.
	var tmpPoolLeases []string
	defer func() {
		// Release all pools we held on to.
		for _, pID := range tmpPoolLeases {
			if err := a.ReleasePool(pID); err != nil {
				log.G(context.TODO()).Warnf("Failed to release overlapping pool")
			}
		}
	}()

	_, globalSpace, err := a.GetDefaultAddressSpaces()
	if err != nil {
		return ipamapi.AllocatedPool{}, err
	}

	remoteReq := &api.RequestPoolRequest{
		AddressSpace: req.AddressSpace,
		Pool:         req.Pool,
		SubPool:      req.SubPool,
		Options:      req.Options,
		V6:           req.V6,
	}

	// Loop until the plugin returns a pool that doesn't overlap req.Exclude,
	// or one of the early-out conditions below applies.
	for {
		alloc, err := a.requestPool(remoteReq)
		if err != nil {
			return alloc, err
		}

		// If the network pool was explicitly chosen, the network belongs to
		// global address space, or it is invalid ("0.0.0.0/0"), then we don't
		// perform check for overlaps.
		//
		// FIXME(thaJeztah): why are we ignoring invalid pools here?
		//
		// The "invalid" conditions was added in [libnetwork#1095][1], which
		// moved code to reduce os-specific dependencies in the ipam package,
		// but also introduced a types.IsIPNetValid() function, which considers
		// "0.0.0.0/0" invalid, and added it to the conditions below.
		//
		// Unfortunately review does not mention this change, so there's no
		// context why. Possibly this was done to prevent errors further down
		// the line (when checking for overlaps), but returning an error here
		// instead would likely have avoided that as well, so we can only guess.
		//
		// [1]: https://github.com/moby/libnetwork/commit/5ca79d6b87873264516323a7b76f0af7d0298492#diff-bdcd879439d041827d334846f9aba01de6e3683ed8fdd01e63917dae6df23846
		if req.Pool != "" || req.AddressSpace == globalSpace || alloc.Pool.String() == "0.0.0.0/0" {
			return alloc, nil
		}

		// Check for overlap and if none found, we have found the right pool.
		if !checkOverlaps(alloc, req.Exclude) {
			return alloc, nil
		}

		// Pool obtained in this iteration is overlapping. Hold onto the pool
		// and don't release it yet, because we don't want IPAM to give us back
		// the same pool over again. But make sure we still do a deferred release
		// when we have either obtained a non-overlapping pool or ran out of
		// pre-defined pools.
		tmpPoolLeases = append(tmpPoolLeases, alloc.PoolID)
	}
}
// requestPool performs a single RequestPool round-trip to the remote plugin
// and parses the returned pool string into an AllocatedPool.
func (a *allocator) requestPool(req *api.RequestPoolRequest) (ipamapi.AllocatedPool, error) {
	res := &api.RequestPoolResponse{}
	if err := a.call("RequestPool", req, res); err != nil {
		return ipamapi.AllocatedPool{}, err
	}
	retPool, err := netip.ParsePrefix(res.Pool)
	if err != nil {
		// Don't return a partially populated AllocatedPool alongside the
		// error; callers must be able to treat a non-nil error as "no pool".
		return ipamapi.AllocatedPool{}, fmt.Errorf("invalid pool %q returned by remote ipam plugin: %w", res.Pool, err)
	}
	return ipamapi.AllocatedPool{
		PoolID: res.PoolID,
		Pool:   retPool,
		Meta:   res.Data,
	}, nil
}
// checkOverlaps returns true if the 'pool' overlaps with some prefix in 'reserved'.
func checkOverlaps(pool ipamapi.AllocatedPool, reserved []netip.Prefix) bool {
	// Prefix overlap is symmetric, so either operand order works.
	for i := range reserved {
		if pool.Pool.Overlaps(reserved[i]) {
			return true
		}
	}
	return false
}
// ReleasePool removes an address pool from the specified address space
func (a *allocator) ReleasePool(poolID string) error {
	req := &api.ReleasePoolRequest{PoolID: poolID}
	res := &api.ReleasePoolResponse{}
	return a.call("ReleasePool", req, res)
}
// RequestAddress requests an address from the address pool
func (a *allocator) RequestAddress(poolID string, address net.IP, options map[string]string) (*net.IPNet, map[string]string, error) {
	// A nil preferred address is sent as the empty string.
	var prefAddress string
	if address != nil {
		prefAddress = address.String()
	}
	req := &api.RequestAddressRequest{PoolID: poolID, Address: prefAddress, Options: options}
	res := &api.RequestAddressResponse{}
	if err := a.call("RequestAddress", req, res); err != nil {
		return nil, nil, err
	}
	// A successful reply without an address is treated as an error.
	if res.Address == "" {
		return nil, nil, ipamapi.ErrNoIPReturned
	}
	retAddress, err := types.ParseCIDR(res.Address)
	return retAddress, res.Data, err
}
// ReleaseAddress releases the address from the specified address pool
func (a *allocator) ReleaseAddress(poolID string, address net.IP) error {
	// A nil address is sent as the empty string.
	relAddress := ""
	if address != nil {
		relAddress = address.String()
	}
	return a.call("ReleaseAddress",
		&api.ReleaseAddressRequest{PoolID: poolID, Address: relAddress},
		&api.ReleaseAddressResponse{})
}
// IsBuiltIn reports false: this driver is provided by an external plugin.
func (a *allocator) IsBuiltIn() bool {
	return false
}
//go:build !windows
package windowsipam
import "github.com/docker/docker/daemon/libnetwork/ipamapi"
// Register is a no-op -- windowsipam is only supported on Windows.
// (This file is built for !windows; the functional implementation lives in
// the Windows build of the package.)
func Register(ipamapi.Registerer) error {
	return nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
// Package ipamutils provides utility functions for ipam management
package ipamutils
import (
"net/netip"
"slices"
)
var (
	// localScopeDefaultNetworks are the default pools used for local-scope
	// networks when none are configured. Each entry is {Base, Size}: Base is
	// carved into subnets of mask length Size.
	localScopeDefaultNetworks = []*NetworkToSplit{
		{netip.MustParsePrefix("172.17.0.0/16"), 16},
		{netip.MustParsePrefix("172.18.0.0/16"), 16},
		{netip.MustParsePrefix("172.19.0.0/16"), 16},
		{netip.MustParsePrefix("172.20.0.0/14"), 16},
		{netip.MustParsePrefix("172.24.0.0/14"), 16},
		{netip.MustParsePrefix("172.28.0.0/14"), 16},
		{netip.MustParsePrefix("192.168.0.0/16"), 20},
	}
	// globalScopeDefaultNetworks is the default pool set for global-scope
	// networks: 10.0.0.0/8 split into /24 subnets.
	globalScopeDefaultNetworks = []*NetworkToSplit{
		{netip.MustParsePrefix("10.0.0.0/8"), 24},
	}
)
// NetworkToSplit represent a network that has to be split in chunks with mask length Size.
// Each subnet in the set is derived from the Base pool. Base is to be passed
// in CIDR format.
// Example: a Base "10.10.0.0/16 with Size 24 will define the set of 256
// 10.10.[0-255].0/24 address pools
type NetworkToSplit struct {
Base netip.Prefix `json:"base"`
Size int `json:"size"`
}
// FirstPrefix returns the first prefix available in NetworkToSplit.
func (n NetworkToSplit) FirstPrefix() netip.Prefix {
return netip.PrefixFrom(n.Base.Addr(), n.Size)
}
// Overlaps is a util function checking whether 'p' overlaps with 'n'.
func (n NetworkToSplit) Overlaps(p netip.Prefix) bool {
return n.Base.Overlaps(p)
}
// GetGlobalScopeDefaultNetworks returns a copy of the global-scope network list.
func GetGlobalScopeDefaultNetworks() []*NetworkToSplit {
	// Shallow copy: callers may reorder/trim the slice without affecting the
	// package default, but elements are shared pointers.
	return slices.Clone(globalScopeDefaultNetworks)
}

// GetLocalScopeDefaultNetworks returns a copy of the default local-scope network list.
func GetLocalScopeDefaultNetworks() []*NetworkToSplit {
	// Shallow copy, same caveat as GetGlobalScopeDefaultNetworks.
	return slices.Clone(localScopeDefaultNetworks)
}
// Package ipbits contains utilities for manipulating [netip.Addr] values as
// numbers or bitfields.
package ipbits
import (
"encoding/binary"
"net/netip"
)
// Add returns ip + (x << shift).
func Add(ip netip.Addr, x uint64, shift uint) netip.Addr {
	if ip.Is4() {
		// 32-bit arithmetic; overflow wraps, matching uint32 semantics.
		a := ip.As4()
		v := binary.BigEndian.Uint32(a[:])
		v += uint32(x) << shift
		binary.BigEndian.PutUint32(a[:], v)
		return netip.AddrFrom4(a)
	}
	// IPv6: 128-bit arithmetic via the uint128 helper.
	a := ip.As16()
	sum := uint128From16(a).add(uint128From(x).lsh(shift))
	sum.fill16(&a)
	return netip.AddrFrom16(a)
}
// SubnetsBetween computes the number of subnets of size 'sz' available between 'a1'
// and 'a2'. The result is capped at [math.MaxUint64]. It returns 0 when one of
// 'a1' or 'a2' is invalid, if both aren't of the same family, or when 'a2' is
// less than 'a1'.
//
// NOTE(review): the cap depends on uint128.uint64 saturating when the value
// exceeds 64 bits — confirm that helper's behavior matches this contract.
func SubnetsBetween(a1 netip.Addr, a2 netip.Addr, sz int) uint64 {
	if !a1.IsValid() || !a2.IsValid() || a1.Is4() != a2.Is4() || a2.Less(a1) {
		return 0
	}

	// Align both endpoints to sz-bit boundaries, then count boundaries by
	// shifting the difference right by the host-bit width.
	p1, _ := a1.Prefix(sz)
	p2, _ := a2.Prefix(sz)

	return subAddr(p2.Addr(), p1.Addr()).rsh(uint(a1.BitLen() - sz)).uint64()
}

// subAddr returns 'ip1 - ip2'. Both netip.Addr have to be of the same address
// family. 'ip1' as to be greater than or equal to 'ip2'.
func subAddr(ip1 netip.Addr, ip2 netip.Addr) uint128 {
	// As16 zero-pads IPv4 addresses identically, so the subtraction is
	// family-agnostic given the precondition.
	return uint128From16(ip1.As16()).sub(uint128From16(ip2.As16()))
}
// Field returns the value of the bitfield [u, v] in ip as an integer,
// where bit 0 is the most-significant bit of ip.
//
// The result is undefined if u > v, if v-u > 64, or if u or v is larger than
// ip.BitLen().
func Field(ip netip.Addr, u, v uint) uint64 {
	if ip.Is4() {
		// Clear the top u bits, then shift the field down to bit 0.
		a := ip.As4()
		mask := ^uint32(0) >> u
		return uint64((binary.BigEndian.Uint32(a[:]) & mask) >> (32 - v))
	}
	// Same scheme in 128-bit arithmetic for IPv6.
	mask := uint128From(0).not().rsh(u)
	return uint128From16(ip.As16()).and(mask).rsh(128 - v).uint64()
}
package ipbits
import (
"encoding/binary"
"math/bits"
)
// uint128 is an unsigned 128-bit integer represented as two 64-bit limbs.
type uint128 struct{ hi, lo uint64 }

// uint128From16 interprets b as a big-endian 128-bit integer.
func uint128From16(b [16]byte) uint128 {
	return uint128{
		hi: binary.BigEndian.Uint64(b[:8]),
		lo: binary.BigEndian.Uint64(b[8:]),
	}
}

// uint128From zero-extends x to 128 bits.
func uint128From(x uint64) uint128 {
	return uint128{lo: x}
}

// add returns x + y, wrapping on overflow past 128 bits.
func (x uint128) add(y uint128) uint128 {
	lo, carry := bits.Add64(x.lo, y.lo, 0)
	hi, _ := bits.Add64(x.hi, y.hi, carry)
	return uint128{hi: hi, lo: lo}
}

// sub returns x - y, wrapping on underflow.
func (x uint128) sub(y uint128) uint128 {
	lo, borrow := bits.Sub64(x.lo, y.lo, 0)
	hi, _ := bits.Sub64(x.hi, y.hi, borrow)
	return uint128{hi: hi, lo: lo}
}

// lsh returns x << n. For n == 64 the general branch is still correct
// because Go defines shifts >= the operand width to yield 0.
func (x uint128) lsh(n uint) uint128 {
	if n > 64 {
		return uint128{hi: x.lo << (n - 64)}
	}
	return uint128{
		hi: x.hi<<n | x.lo>>(64-n),
		lo: x.lo << n,
	}
}

// rsh returns x >> n, with the same n == 64 caveat as lsh.
func (x uint128) rsh(n uint) uint128 {
	if n > 64 {
		return uint128{lo: x.hi >> (n - 64)}
	}
	return uint128{
		hi: x.hi >> n,
		lo: x.lo>>n | x.hi<<(64-n),
	}
}

// and returns the bitwise AND of x and y.
func (x uint128) and(y uint128) uint128 {
	return uint128{hi: x.hi & y.hi, lo: x.lo & y.lo}
}

// not returns the bitwise complement of x.
func (x uint128) not() uint128 {
	return uint128{hi: ^x.hi, lo: ^x.lo}
}

// fill16 writes x to a as a big-endian 128-bit integer.
func (x uint128) fill16(a *[16]byte) {
	binary.BigEndian.PutUint64(a[:8], x.hi)
	binary.BigEndian.PutUint64(a[8:], x.lo)
}

// uint64 returns x as a uint64, saturating to math.MaxUint64 when x does not
// fit in 64 bits. Previously the high limb was silently discarded, which
// contradicted SubnetsBetween's documented "capped at MaxUint64" contract.
func (x uint128) uint64() uint64 {
	if x.hi != 0 {
		return ^uint64(0)
	}
	return x.lo
}
//go:build linux
package iptables
import (
"context"
"errors"
"net"
"syscall"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/internal/nlwrap"
"github.com/vishvananda/netlink"
)
// checkConntrackProgrammable checks if the handle supports the
// NETLINK_NETFILTER and the base modules are loaded.
func checkConntrackProgrammable(nlh nlwrap.Handle) error {
	if !nlh.SupportsNetlinkFamily(syscall.NETLINK_NETFILTER) {
		return errors.New("conntrack is not available")
	}
	return nil
}
// DeleteConntrackEntries deletes all the conntrack connections on the host for the specified IP
// Returns the number of flows deleted for IPv4, IPv6 else error
func DeleteConntrackEntries(nlh nlwrap.Handle, ipv4List []net.IP, ipv6List []net.IP) error {
	if err := checkConntrackProgrammable(nlh); err != nil {
		return err
	}

	// purge flushes the flows for every IP in ips; per-address failures are
	// logged and skipped rather than aborting the whole operation.
	purge := func(family netlink.InetFamily, ips []net.IP) uint {
		var total uint
		for _, ipAddress := range ips {
			flowPurged, err := purgeConntrackState(nlh, family, ipAddress)
			if err != nil {
				log.G(context.TODO()).Warnf("Failed to delete conntrack state for %s: %v", ipAddress, err)
				continue
			}
			total += flowPurged
		}
		return total
	}

	totalIPv4FlowPurged := purge(syscall.AF_INET, ipv4List)
	totalIPv6FlowPurged := purge(syscall.AF_INET6, ipv6List)

	if totalIPv4FlowPurged > 0 || totalIPv6FlowPurged > 0 {
		log.G(context.TODO()).Debugf("DeleteConntrackEntries purged ipv4:%d, ipv6:%d", totalIPv4FlowPurged, totalIPv6FlowPurged)
	}
	return nil
}
// DeleteConntrackEntriesByPort deletes the conntrack flows whose original
// destination port matches one of 'ports' for the given protocol, for both
// IPv4 and IPv6. Per-port/per-family failures are logged and skipped rather
// than aborting the whole operation.
func DeleteConntrackEntriesByPort(nlh nlwrap.Handle, proto types.Protocol, ports []uint16) error {
	if err := checkConntrackProgrammable(nlh); err != nil {
		return err
	}
	var totalIPv4FlowPurged uint
	var totalIPv6FlowPurged uint
	for _, port := range ports {
		// Build a filter matching this protocol and original destination port.
		filter := &netlink.ConntrackFilter{}
		if err := filter.AddProtocol(uint8(proto)); err != nil {
			log.G(context.TODO()).Warnf("Failed to delete conntrack state for %s port %d: %v", proto.String(), port, err)
			continue
		}
		if err := filter.AddPort(netlink.ConntrackOrigDstPort, port); err != nil {
			log.G(context.TODO()).Warnf("Failed to delete conntrack state for %s port %d: %v", proto.String(), port, err)
			continue
		}
		// The same filter is applied separately per address family; an IPv4
		// failure does not prevent the IPv6 attempt.
		v4FlowPurged, err := nlh.ConntrackDeleteFilters(netlink.ConntrackTable, syscall.AF_INET, filter)
		if err != nil {
			log.G(context.TODO()).Warnf("Failed to delete conntrack state for IPv4 %s port %d: %v", proto.String(), port, err)
		}
		totalIPv4FlowPurged += v4FlowPurged
		v6FlowPurged, err := nlh.ConntrackDeleteFilters(netlink.ConntrackTable, syscall.AF_INET6, filter)
		if err != nil {
			log.G(context.TODO()).Warnf("Failed to delete conntrack state for IPv6 %s port %d: %v", proto.String(), port, err)
		}
		totalIPv6FlowPurged += v6FlowPurged
	}

	if totalIPv4FlowPurged > 0 || totalIPv6FlowPurged > 0 {
		log.G(context.TODO()).Debugf("DeleteConntrackEntriesByPort for %s ports purged ipv4:%d, ipv6:%d", proto.String(), totalIPv4FlowPurged, totalIPv6FlowPurged)
	}
	return nil
}
// purgeConntrackState deletes all conntrack flows of the given family where
// ipAddress appears on either side of a NAT mapping, returning the number of
// flows removed.
func purgeConntrackState(nlh nlwrap.Handle, family netlink.InetFamily, ipAddress net.IP) (uint, error) {
	filter := &netlink.ConntrackFilter{}
	// NOTE: doing the flush using the ipAddress is safe because today there cannot be multiple networks with the same subnet
	// so it will not be possible to flush flows that are of other containers
	if err := filter.AddIP(netlink.ConntrackNatAnyIP, ipAddress); err != nil {
		return 0, err
	}
	return nlh.ConntrackDeleteFilters(netlink.ConntrackTable, family, filter)
}
//go:build linux
package iptables
import (
	"context"
	"fmt"
	"slices"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/containerd/log"
	dbus "github.com/godbus/dbus/v5"
	"github.com/pkg/errors"
)
const (
	// dbusInterface is the well-known bus name of the firewalld service.
	dbusInterface = "org.fedoraproject.FirewallD1"
	// dbusPath is the object path for runtime configuration calls.
	dbusPath = "/org/fedoraproject/FirewallD1"
	// dbusConfigPath is the object path for permanent configuration calls.
	dbusConfigPath = "/org/fedoraproject/FirewallD1/config"
	// dockerZone is the firewalld zone docker interfaces are placed in.
	dockerZone = "docker"
	// dockerFwdPolicy is the policy allowing forwarding into dockerZone.
	dockerFwdPolicy = "docker-forwarding"
)
// Conn is a connection to firewalld dbus endpoint.
type Conn struct {
	// sysconn is the underlying system-bus connection.
	sysconn *dbus.Conn
	// sysObj addresses the runtime configuration object (dbusPath).
	sysObj dbus.BusObject
	// sysConfObj addresses the permanent configuration object (dbusConfigPath).
	sysConfObj dbus.BusObject
	// signal receives D-Bus signals matched by newConnection.
	signal chan *dbus.Signal
}
var (
	// connection is the process-wide firewalld connection; nil when
	// firewalld is not running.
	connection *Conn

	firewalldRunning bool // is Firewalld service running

	// Time of the last firewalld reload.
	firewalldReloadedAt atomic.Value
	// Mutex to serialise firewalld reload callbacks.
	firewalldReloadMu sync.Mutex

	onReloaded []*func() // callbacks when Firewalld has been reloaded
)
// UsingFirewalld returns true if iptables rules will be applied via firewalld's
// passthrough interface.
func UsingFirewalld() bool {
	// Best-effort init; the error is deliberately ignored and the cached
	// firewalldRunning flag is consulted instead.
	_ = initCheck()
	return firewalldRunning
}
// FirewalldReloadedAt returns the time at which the daemon last completed a
// firewalld reload, or a zero-valued time.Time if it has not been reloaded
// since the daemon started.
func FirewalldReloadedAt() time.Time {
	// The atomic.Value holds nil until the first reload; a failed type
	// assertion then yields the zero time.
	if t, ok := firewalldReloadedAt.Load().(time.Time); ok {
		return t
	}
	return time.Time{}
}
// firewalldInit initializes firewalld management code.
func firewalldInit() error {
	var err error
	if connection, err = newConnection(); err != nil {
		return fmt.Errorf("Failed to connect to D-Bus system bus: %v", err)
	}
	firewalldRunning = checkRunning()
	if !firewalldRunning {
		// Service not running: drop the connection and operate without
		// firewalld.
		connection.sysconn.Close()
		connection = nil
		return nil
	}

	go signalHandler()

	zoneAdded, err := setupDockerZone()
	if err != nil {
		return err
	}
	policyAdded, policyAddErr := setupDockerForwardingPolicy()
	if policyAddErr != nil {
		// Log the error, but still reload firewalld if necessary.
		log.G(context.TODO()).WithError(policyAddErr).Warnf("Firewalld: failed to add policy %s", dockerFwdPolicy)
	}
	if zoneAdded || policyAdded {
		// Reload for changes to take effect.
		if err := connection.sysObj.Call(dbusInterface+".reload", 0).Err; err != nil {
			return err
		}
	}
	return nil
}
// newConnection establishes a connection to the system bus.
func newConnection() (*Conn, error) {
	c := &Conn{}

	var err error
	c.sysconn, err = dbus.SystemBus()
	if err != nil {
		return nil, err
	}

	// This never fails, even if the service is not running atm.
	c.sysObj = c.sysconn.Object(dbusInterface, dbusPath)
	c.sysConfObj = c.sysconn.Object(dbusInterface, dbusConfigPath)

	// Subscribe to firewalld's Reloaded signal and to bus-ownership changes
	// of the firewalld name (service start/stop). AddMatch errors are
	// deliberately ignored (best effort).
	rule := fmt.Sprintf("type='signal',path='%s',interface='%s',sender='%s',member='Reloaded'", dbusPath, dbusInterface, dbusInterface)
	c.sysconn.BusObject().Call("org.freedesktop.DBus.AddMatch", 0, rule)

	rule = fmt.Sprintf("type='signal',interface='org.freedesktop.DBus',member='NameOwnerChanged',path='/org/freedesktop/DBus',sender='org.freedesktop.DBus',arg0='%s'", dbusInterface)
	c.sysconn.BusObject().Call("org.freedesktop.DBus.AddMatch", 0, rule)

	// Buffered so slow handling doesn't immediately block the dbus library.
	c.signal = make(chan *dbus.Signal, 10)
	c.sysconn.Signal(c.signal)
	return c, nil
}
// signalHandler dispatches firewalld D-Bus signals: bus-name ownership
// changes (service start/stop) and "Reloaded" notifications. It runs as a
// goroutine for the lifetime of the connection's signal channel.
func signalHandler() {
	for signal := range connection.signal {
		switch {
		case strings.Contains(signal.Name, "NameOwnerChanged"):
			// firewalld may have started or stopped; re-probe it.
			firewalldRunning = checkRunning()
			dbusConnectionChanged(signal.Body)
		case strings.Contains(signal.Name, "Reloaded"):
			reloaded()
		}
	}
}
// dbusConnectionChanged handles a NameOwnerChanged signal body, which is
// (name, old_owner, new_owner). A new owner means firewalld (re)started; a
// vanished owner means it stopped.
func dbusConnectionChanged(args []interface{}) {
	// Guard against malformed or truncated signal bodies instead of
	// panicking on an unchecked type assertion in this long-lived goroutine.
	if len(args) < 3 {
		return
	}
	name, _ := args[0].(string)
	oldOwner, _ := args[1].(string)
	newOwner, _ := args[2].(string)

	if name != dbusInterface {
		return
	}

	if newOwner != "" {
		connectionEstablished()
	} else if oldOwner != "" {
		connectionLost()
	}
}
// connectionEstablished is invoked when firewalld (re)acquires its bus name;
// treat it like a reload so registered callbacks re-apply their rules.
func connectionEstablished() {
	reloaded()
}
// connectionLost is invoked when firewalld releases its bus name.
func connectionLost() {
	// Doesn't do anything for now. Libvirt also doesn't react to this.
}
// call all callbacks
func reloaded() {
	// Serialise reloads so callbacks never run concurrently with each other.
	// NOTE(review): OnReloaded appends to onReloaded without holding
	// firewalldReloadMu — confirm all registrations happen before signal
	// handling starts, otherwise this iteration races with the append.
	firewalldReloadMu.Lock()
	defer firewalldReloadMu.Unlock()
	for _, pf := range onReloaded {
		(*pf)()
	}
	// Record completion time; read via FirewalldReloadedAt.
	firewalldReloadedAt.Store(time.Now())
}
// OnReloaded add callback
func OnReloaded(callback func()) {
	// NOTE(review): this dedup check can never match — &callback takes the
	// address of the fresh parameter, which is unique on every call, so each
	// invocation always appends. Confirm whether dedup is actually needed.
	for _, pf := range onReloaded {
		if pf == &callback {
			return
		}
	}
	onReloaded = append(onReloaded, &callback)
}
// Call some remote method to see whether the service is actually running.
func checkRunning() bool {
	if connection == nil {
		return false
	}
	// getDefaultZone is used purely as a liveness probe; the zone value
	// itself is discarded.
	var zone string
	return connection.sysObj.Call(dbusInterface+".getDefaultZone", 0).Store(&zone) == nil
}
// passthrough method simply passes args through to iptables/ip6tables
func passthrough(ipv IPVersion, args ...string) ([]byte, error) {
	var output string
	log.G(context.TODO()).Debugf("Firewalld passthrough: %s, %s", ipv, args)
	if err := connection.sysObj.Call(dbusInterface+".direct.passthrough", 0, ipv, args).Store(&output); err != nil {
		return nil, err
	}
	return []byte(output), nil
}
// firewalldZone holds the firewalld zone settings.
//
// Documented in https://firewalld.org/documentation/man-pages/firewalld.dbus.html#FirewallD1.zone
//
// Field order matters: settings() marshals the struct positionally into the
// (sssbsasa(ss)asba(ssss)asasasasa(ss)b) D-Bus signature expected by addZone.
type firewalldZone struct {
	version            string
	name               string
	description        string
	unused             bool
	target             string
	services           []string
	ports              [][]interface{}
	icmpBlocks         []string
	masquerade         bool
	forwardPorts       [][]interface{}
	interfaces         []string
	sourceAddresses    []string
	richRules          []string
	protocols          []string
	sourcePorts        [][]interface{}
	icmpBlockInversion bool
}
// settings returns the firewalldZone struct as an interface slice, which can be
// passed to "org.fedoraproject.FirewallD1.config.addZone". Note that 'addZone',
// which is deprecated, requires this whole struct. Its replacement, 'addZone2'
// (introduced in firewalld 0.9.0) accepts a dictionary where only non-default
// values need to be specified.
func (z firewalldZone) settings() []interface{} {
	// The elements must stay in exactly this order to match the D-Bus
	// signature of addZone.
	return []interface{}{
		z.version,
		z.name,
		z.description,
		z.unused,
		z.target,
		z.services,
		z.ports,
		z.icmpBlocks,
		z.masquerade,
		z.forwardPorts,
		z.interfaces,
		z.sourceAddresses,
		z.richRules,
		z.protocols,
		z.sourcePorts,
		z.icmpBlockInversion,
	}
}
// setupDockerZone creates a zone called docker in firewalld which includes docker interfaces to allow
// container networking. The bool return value is true if a firewalld reload is required.
func setupDockerZone() (bool, error) {
	var zones []string
	// Check if zone exists
	if err := connection.sysObj.Call(dbusInterface+".zone.getZones", 0).Store(&zones); err != nil {
		return false, err
	}
	if contains(zones, dockerZone) {
		log.G(context.TODO()).Infof("Firewalld: %s zone already exists, returning", dockerZone)
		return false, nil
	}
	log.G(context.TODO()).Debugf("Firewalld: creating %s zone", dockerZone)

	// Permanent
	// The zone is added to the permanent config (sysConfObj); the caller
	// reloads firewalld to make it effective at runtime.
	dz := firewalldZone{
		version:     "1.0",
		name:        dockerZone,
		description: "zone for docker bridge network interfaces",
		target:      "ACCEPT",
	}
	if err := connection.sysConfObj.Call(dbusInterface+".config.addZone", 0, dockerZone, dz.settings()).Err; err != nil {
		return false, err
	}
	return true, nil
}
// setupDockerForwardingPolicy creates a policy to allow forwarding to anywhere to the docker
// zone (where packets will be dealt with by docker's usual/non-firewalld configuration).
// The bool return value is true if a firewalld reload is required.
func setupDockerForwardingPolicy() (bool, error) {
	// https://firewalld.org/documentation/man-pages/firewalld.dbus.html#FirewallD1.config
	policy := map[string]interface{}{
		"version":       "1.0",
		"description":   "allow forwarding to the docker zone",
		"ingress_zones": []string{"ANY"},
		"egress_zones":  []string{dockerZone},
		"target":        "ACCEPT",
	}
	if err := connection.sysConfObj.Call(dbusInterface+".config.addPolicy", 0, dockerFwdPolicy, policy).Err; err != nil {
		var derr dbus.Error
		if errors.As(err, &derr) {
			// The policy already exists from a previous run: not an error,
			// and no reload needed on its account.
			if derr.Name == dbusInterface+".Exception" && strings.HasPrefix(err.Error(), "NAME_CONFLICT") {
				log.G(context.TODO()).Debugf("Firewalld: %s policy already exists", dockerFwdPolicy)
				return false, nil
			}
			// Pre-0.9 firewalld has no policy support; silently skip.
			if derr.Name == dbus.ErrMsgUnknownMethod.Name {
				log.G(context.TODO()).Debugf("Firewalld: addPolicy %s: unknown method", dockerFwdPolicy)
				return false, nil
			}
		}
		return false, err
	}
	log.G(context.TODO()).Infof("Firewalld: created %s policy", dockerFwdPolicy)
	return true, nil
}
// AddInterfaceFirewalld adds the interface to the trusted zone. It is a
// no-op if firewalld is not running.
func AddInterfaceFirewalld(intf string) error {
	if !firewalldRunning {
		return nil
	}

	var intfs []string
	// Check if interface is already added to the zone
	if err := connection.sysObj.Call(dbusInterface+".zone.getInterfaces", 0, dockerZone).Store(&intfs); err != nil {
		return err
	}
	// Return if interface is already part of the zone
	if contains(intfs, intf) {
		log.G(context.TODO()).Infof("Firewalld: interface %s already part of %s zone, returning", intf, dockerZone)
		return nil
	}

	log.G(context.TODO()).Debugf("Firewalld: adding %s interface to %s zone", intf, dockerZone)
	// Runtime
	// Runtime-only change: it does not persist across a firewalld restart.
	if err := connection.sysObj.Call(dbusInterface+".zone.addInterface", 0, dockerZone, intf).Err; err != nil {
		return err
	}
	return nil
}
// DelInterfaceFirewalld removes the interface from the trusted zone. It is a
// no-op if firewalld is not running.
func DelInterfaceFirewalld(intf string) error {
	if !firewalldRunning {
		return nil
	}
	// Fetch the interfaces currently in the docker zone.
	var current []string
	if err := connection.sysObj.Call(dbusInterface+".zone.getInterfaces", 0, dockerZone).Store(&current); err != nil {
		return err
	}
	// The interface must be part of the zone to be removable.
	if !contains(current, intf) {
		return &interfaceNotFound{fmt.Errorf("firewalld: interface %q not found in %s zone", intf, dockerZone)}
	}
	log.G(context.TODO()).Debugf("Firewalld: removing %s interface from %s zone", intf, dockerZone)
	// Runtime
	return connection.sysObj.Call(dbusInterface+".zone.removeInterface", 0, dockerZone, intf).Err
}
// interfaceNotFound wraps an error to indicate that an interface was not part
// of the docker zone. The NotFound marker method lets callers classify it via
// an interface assertion (errdefs-style "not found" matching).
type interfaceNotFound struct{ error }

// NotFound marks the error as a "not found" error.
func (interfaceNotFound) NotFound() {}
// contains reports whether val is present in list.
func contains(list []string, val string) bool {
	for i := range list {
		if list[i] == val {
			return true
		}
	}
	return false
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23 && linux
package iptables
import (
"context"
"errors"
"fmt"
"net/netip"
"os/exec"
"slices"
"strconv"
"strings"
"sync"
"time"
"github.com/containerd/log"
"github.com/docker/docker/pkg/rootless"
)
// Action signifies the iptable action.
type Action string

const (
	// Append appends the rule at the end of the chain.
	Append Action = "-A"
	// Delete deletes the rule from the chain.
	Delete Action = "-D"
	// Insert inserts the rule at the top of the chain.
	Insert Action = "-I"
)

// Policy is the default iptable policies
type Policy string

const (
	// Drop is the default iptables DROP policy.
	Drop Policy = "DROP"
	// Accept is the default iptables ACCEPT policy.
	Accept Policy = "ACCEPT"
)

// Table refers to Nat, Filter or Mangle.
type Table string

const (
	// Nat table is used for nat translation rules.
	Nat Table = "nat"
	// Filter table is used for filter rules.
	Filter Table = "filter"
	// Mangle table is used for mangling the packet.
	Mangle Table = "mangle"
	// Raw table is used for filtering packets before they are NATed.
	Raw Table = "raw"
)

// IPVersion refers to IP version, v4 or v6
type IPVersion string

const (
	// IPv4 is version 4.
	IPv4 IPVersion = "ipv4"
	// IPv6 is version 6.
	IPv6 IPVersion = "ipv6"
)

var (
	// iptablesPath and ip6tablesPath hold the resolved binary locations; they
	// are set once by detectIptables (via initOnce) and stay empty when the
	// corresponding binary is not found on PATH.
	iptablesPath  string
	ip6tablesPath string
	initOnce      sync.Once
)
// IPTable defines struct with [IPVersion].
type IPTable struct {
	// ipVersion selects which binary (iptables/ip6tables) and which
	// firewalld passthrough family the methods operate on.
	ipVersion IPVersion
}

// ChainInfo defines the iptables chain.
type ChainInfo struct {
	Name      string    // chain name, e.g. "DOCKER"
	Table     Table     // table the chain lives in (nat/filter/mangle/raw)
	IPVersion IPVersion // IP family the chain applies to
}
// ChainError is returned to represent errors during ip table operation.
type ChainError struct {
	Chain  string // chain on which the operation failed
	Output []byte // raw iptables output
}

// Error implements the error interface, reporting the failed chain together
// with the raw iptables output.
func (e ChainError) Error() string {
	return "error iptables " + e.Chain + ": " + string(e.Output)
}
// loopbackAddress returns the loopback address for the given IP version. An
// empty version selects IPv4 for backward-compatibility; any other unknown
// version panics.
func loopbackAddress(version IPVersion) string {
	if version == IPv6 {
		return "::1/128"
	}
	if version == IPv4 || version == "" {
		// IPv4 (default for backward-compatibility)
		return "127.0.0.0/8"
	}
	panic("unknown IP version: " + version)
}
// detectIptables locates the iptables and ip6tables binaries on PATH and
// caches their locations in package-level variables. A missing iptables
// leaves iptablesPath empty (initCheck later reports the error); a missing
// ip6tables only disables IPv6 handling.
func detectIptables() {
	ipt, err := exec.LookPath("iptables")
	if err != nil {
		log.G(context.TODO()).WithError(err).Warnf("failed to find iptables")
		return
	}
	iptablesPath = ipt
	ip6t, err := exec.LookPath("ip6tables")
	if err != nil {
		log.G(context.TODO()).WithError(err).Warnf("unable to find ip6tables")
		return
	}
	ip6tablesPath = ip6t
}
// initFirewalld connects to firewalld when appropriate; on failure the
// package silently falls back to invoking iptables directly.
func initFirewalld() {
	// When running with RootlessKit, firewalld is running as the root outside our network namespace
	// https://github.com/moby/moby/issues/43781
	if rootless.RunningWithRootlessKit() {
		log.G(context.TODO()).Info("skipping firewalld management for rootless mode")
		return
	}
	// Best-effort: a failed init is only logged at debug level.
	if err := firewalldInit(); err != nil {
		log.G(context.TODO()).WithError(err).Debugf("unable to initialize firewalld; using raw iptables instead")
	}
}

// initDependencies performs the one-time package initialization: connect to
// firewalld (when appropriate) and locate the iptables binaries.
func initDependencies() {
	initFirewalld()
	detectIptables()
}

// initCheck runs initDependencies exactly once (guarded by initOnce) and
// reports an error when no usable iptables binary was found.
func initCheck() error {
	initOnce.Do(initDependencies)
	if iptablesPath == "" {
		return errors.New("iptables not found")
	}
	return nil
}
// GetIptable returns an instance of IPTable with specified version ([IPv4]
// or [IPv6]). An empty version defaults to IPv4 for backward-compatibility;
// any other value panics.
func GetIptable(version IPVersion) *IPTable {
	if version == "" {
		// default is IPv4 for backward-compatibility
		version = IPv4
	} else if version != IPv4 && version != IPv6 {
		panic("unknown IP version: " + version)
	}
	return &IPTable{ipVersion: version}
}
// NewChain adds a new chain to ip table.
func (iptable IPTable) NewChain(name string, table Table) (*ChainInfo, error) {
	if name == "" {
		return nil, errors.New("could not create chain: chain name is empty")
	}
	if table == "" {
		return nil, fmt.Errorf("could not create chain %s: invalid table name: table name is empty", name)
	}
	// Probe for the chain first ("-n -L" lists it without name resolution);
	// only create it (-N) when the probe fails.
	if _, err := iptable.Raw("-t", string(table), "-n", "-L", name); err != nil {
		output, err := iptable.Raw("-t", string(table), "-N", name)
		if err != nil {
			return nil, err
		}
		if len(output) != 0 {
			return nil, fmt.Errorf("could not create %s/%s chain: %s", table, name, output)
		}
	}
	return &ChainInfo{Name: name, Table: table, IPVersion: iptable.ipVersion}, nil
}
// RemoveExistingChain removes existing chain from the table.
func (iptable IPTable) RemoveExistingChain(name string, table Table) error {
	if name == "" {
		return errors.New("could not remove chain: chain name is empty")
	}
	if table == "" {
		return fmt.Errorf("could not remove chain %s: invalid table name: table name is empty", name)
	}
	// Delegate the actual cleanup to ChainInfo.Remove.
	return (&ChainInfo{
		Name:      name,
		Table:     table,
		IPVersion: iptable.ipVersion,
	}).Remove()
}
// Link adds reciprocal ACCEPT rule for two supplied IP addresses.
// Traffic is allowed from ip1 to ip2 and vice-versa
func (c *ChainInfo) Link(action Action, ip1, ip2 netip.Addr, port int, proto string, bridgeName string) error {
	iptable := GetIptable(c.IPVersion)
	// forward: allow ip1 -> ip2 on the given destination port.
	args := []string{
		"-i", bridgeName, "-o", bridgeName,
		"-p", proto,
		"-s", ip1.String(),
		"-d", ip2.String(),
		"--dport", strconv.Itoa(port),
		"-j", "ACCEPT",
	}
	if err := iptable.ProgramRule(Filter, c.Name, action, args); err != nil {
		return err
	}
	// reverse: swap the -s and -d values (args[7] and args[9]) so the rule
	// matches ip2 -> ip1, and flip "--dport" (args[10]) to "--sport" so the
	// same port is matched as the source port in that direction.
	args[7], args[9] = args[9], args[7]
	args[10] = "--sport"
	return iptable.ProgramRule(Filter, c.Name, action, args)
}
// ProgramRule adds the rule specified by args only if the
// rule is not already present in the chain. Reciprocally,
// it removes the rule only if present.
func (iptable IPTable) ProgramRule(table Table, chain string, action Action, args []string) error {
	// Skip the iptables invocation when it would be a no-op: adding a rule
	// that already exists, or deleting one that doesn't. The only
	// combinations that need work are (exists && Delete) and
	// (!exists && add/insert), i.e. when the two sides are equal.
	if iptable.Exists(table, chain, args...) != (action == Delete) {
		return nil
	}
	return iptable.RawCombinedOutput(append([]string{"-t", string(table), string(action), chain}, args...)...)
}
// Prerouting adds linking rule to nat/PREROUTING chain.
func (c *ChainInfo) Prerouting(action Action, args ...string) error {
	cmd := append([]string{"-t", string(Nat), string(action), "PREROUTING"}, args...)
	output, err := GetIptable(c.IPVersion).Raw(cmd...)
	if err != nil {
		return err
	}
	// Non-empty output with a zero exit status still indicates a problem.
	if len(output) != 0 {
		return ChainError{Chain: "PREROUTING", Output: output}
	}
	return nil
}
// Output adds linking rule to an OUTPUT chain.
func (c *ChainInfo) Output(action Action, args ...string) error {
	cmd := append([]string{"-t", string(c.Table), string(action), "OUTPUT"}, args...)
	output, err := GetIptable(c.IPVersion).Raw(cmd...)
	if err != nil {
		return err
	}
	// Non-empty output with a zero exit status still indicates a problem.
	if len(output) != 0 {
		return ChainError{Chain: "OUTPUT", Output: output}
	}
	return nil
}
// Remove removes the chain.
func (c *ChainInfo) Remove() error {
	// Ignore errors - This could mean the chains were never set up
	if c.Table == Nat {
		// Drop the jump rules that reference this chain from the built-in
		// PREROUTING/OUTPUT chains before deleting the chain itself.
		_ = c.Prerouting(Delete, "-m", "addrtype", "--dst-type", "LOCAL", "-j", c.Name)
		_ = c.Output(Delete, "-m", "addrtype", "--dst-type", "LOCAL", "!", "--dst", loopbackAddress(c.IPVersion), "-j", c.Name)
		_ = c.Output(Delete, "-m", "addrtype", "--dst-type", "LOCAL", "-j", c.Name) // Created in versions <= 0.1.6
		_ = c.Prerouting(Delete)
		_ = c.Output(Delete)
	}
	iptable := GetIptable(c.IPVersion)
	// Flush (-F) the chain's rules, then delete (-X) the now-empty chain.
	_, _ = iptable.Raw("-t", string(c.Table), "-F", c.Name)
	_, _ = iptable.Raw("-t", string(c.Table), "-X", c.Name)
	return nil
}
// Exists checks if a rule exists. The check may go through the firewalld
// passthrough when firewalld is running.
func (iptable IPTable) Exists(table Table, chain string, rule ...string) bool {
	return iptable.exists(false, table, chain, rule...)
}

// ExistsNative behaves as Exists with the difference it
// will always invoke `iptables` binary.
func (iptable IPTable) ExistsNative(table Table, chain string, rule ...string) bool {
	return iptable.exists(true, table, chain, rule...)
}
// exists reports whether the rule is present in table/chain, using
// `iptables -C`. When native is true the binary is invoked directly,
// bypassing any firewalld passthrough. An empty table defaults to filter.
func (iptable IPTable) exists(native bool, table Table, chain string, rule ...string) bool {
	if err := initCheck(); err != nil {
		// The exists() signature does not allow us to return an error, but at least
		// we can skip the (likely invalid) exec invocation.
		return false
	}
	if table == "" {
		table = Filter
	}
	run := iptable.Raw
	if native {
		run = iptable.raw
	}
	// iptables -C exits with status 0 iff the rule exists.
	_, err := run(append([]string{"-t", string(table), "-C", chain}, rule...)...)
	return err == nil
}
const (
	// opWarnTime is the maximum duration that an iptables operation can take before flagging a warning.
	opWarnTime = 2 * time.Second
	// xLockWaitMsg is the iptables warning about xtables lock that can be suppressed.
	// When present, filterOutput replaces the whole output with an empty slice.
	xLockWaitMsg = "Another app is currently holding the xtables lock"
)
// filterOutput post-processes raw iptables output: it logs a warning when
// the invocation took longer than opWarnTime (a sign of xtables lock
// contention), and strips the harmless xtables-lock wait message so callers
// that treat non-empty output as an error don't trip over it.
func filterOutput(start time.Time, output []byte, args ...string) []byte {
	opTime := time.Since(start)
	if opTime > opWarnTime {
		// Flag operations that have taken a long time to complete
		log.G(context.TODO()).Warnf("xtables contention detected while running [%s]: Waited for %.2f seconds and received %q", strings.Join(args, " "), float64(opTime)/float64(time.Second), string(output))
	}
	// ignore iptables' message about xtables lock:
	// it is a warning, not an error.
	if strings.Contains(string(output), xLockWaitMsg) {
		return []byte("")
	}
	// Put further filters here if desired
	return output
}
// Raw calls 'iptables' system command, passing supplied arguments.
func (iptable IPTable) Raw(args ...string) ([]byte, error) {
	if firewalldRunning {
		startTime := time.Now()
		output, err := passthrough(iptable.ipVersion, args...)
		// Use the firewalld passthrough result unless the D-Bus service
		// turned out to be unavailable ("was not provided by any .service
		// files"); in that one case, fall through to the binary below.
		if err == nil || !strings.Contains(err.Error(), "was not provided by any .service files") {
			return filterOutput(startTime, output, args...), err
		}
	}
	return iptable.raw(args...)
}
// raw invokes the iptables (or ip6tables, for IPv6) binary directly,
// bypassing firewalld. "--wait" is always prepended so concurrent
// invocations block on the xtables lock instead of failing.
func (iptable IPTable) raw(args ...string) ([]byte, error) {
	if err := initCheck(); err != nil {
		return nil, err
	}
	path, commandName := iptablesPath, "iptables"
	if iptable.ipVersion == IPv6 {
		if ip6tablesPath == "" {
			return nil, errors.New("ip6tables is missing")
		}
		path, commandName = ip6tablesPath, "ip6tables"
	}
	args = append([]string{"--wait"}, args...)
	log.G(context.TODO()).Debugf("%s, %v", path, args)
	startTime := time.Now()
	output, err := exec.Command(path, args...).CombinedOutput()
	if err != nil {
		return nil, fmt.Errorf("iptables failed: %s %v: %s (%s)", commandName, strings.Join(args, " "), output, err)
	}
	return filterOutput(startTime, output, args...), nil
}
// RawCombinedOutput internally calls the Raw function and returns a non nil
// error if Raw returned a non nil error or a non empty output
func (iptable IPTable) RawCombinedOutput(args ...string) error {
	// NOTE: when err is nil but output is non-empty, the "(%v)" suffix
	// renders as "(<nil>)"; callers only rely on the error being non-nil.
	if output, err := iptable.Raw(args...); err != nil || len(output) != 0 {
		return fmt.Errorf("%s (%v)", string(output), err)
	}
	return nil
}

// RawCombinedOutputNative behave as RawCombinedOutput with the difference it
// will always invoke `iptables` binary
func (iptable IPTable) RawCombinedOutputNative(args ...string) error {
	if output, err := iptable.raw(args...); err != nil || len(output) != 0 {
		return fmt.Errorf("%s (%v)", string(output), err)
	}
	return nil
}
// ExistChain checks if a chain exists
func (iptable IPTable) ExistChain(chain string, table Table) bool {
	// "-nL <chain>" exits non-zero when the chain does not exist.
	_, err := iptable.Raw("-t", string(table), "-nL", chain)
	return err == nil
}

// SetDefaultPolicy sets the passed default policy for the table/chain
func (iptable IPTable) SetDefaultPolicy(table Table, chain string, policy Policy) error {
	if err := iptable.RawCombinedOutput("-t", string(table), "-P", chain, string(policy)); err != nil {
		return fmt.Errorf("setting default policy to %v in %v chain failed: %v", policy, chain, err)
	}
	return nil
}
// AddReturnRule adds a return rule for the chain in the filter table
func (iptable IPTable) AddReturnRule(chain string) error {
	// Already present; nothing to do.
	if iptable.Exists(Filter, chain, "-j", "RETURN") {
		return nil
	}
	err := iptable.RawCombinedOutput("-A", chain, "-j", "RETURN")
	if err != nil {
		return fmt.Errorf("unable to add return rule in %s chain: %v", chain, err)
	}
	return nil
}
// EnsureJumpRule ensures the jump rule is on top
func (iptable IPTable) EnsureJumpRule(fromChain, toChain string, rule ...string) error {
	// Delete any existing copy of the jump first so the "-I" below always
	// leaves it at the top of fromChain.
	if err := iptable.DeleteJumpRule(fromChain, toChain, rule...); err != nil {
		return err
	}
	// NOTE(review): append may write into the caller's backing array when
	// spare capacity exists; variadic call sites pass fresh slices so this
	// is fine today -- confirm before passing a shared slice here.
	rule = append(rule, "-j", toChain)
	if err := iptable.RawCombinedOutput(append([]string{"-I", fromChain}, rule...)...); err != nil {
		return fmt.Errorf("unable to insert jump to %s rule in %s chain: %v", toChain, fromChain, err)
	}
	return nil
}

// DeleteJumpRule deletes a rule added by EnsureJumpRule. It's a no-op if the rule
// doesn't exist. The lookup is always done in the filter table.
func (iptable IPTable) DeleteJumpRule(fromChain, toChain string, rule ...string) error {
	rule = append(rule, "-j", toChain)
	if iptable.Exists(Filter, fromChain, rule...) {
		if err := iptable.RawCombinedOutput(append([]string{"-D", fromChain}, rule...)...); err != nil {
			return fmt.Errorf("unable to remove jump to %s rule in %s chain: %v", toChain, fromChain, err)
		}
	}
	return nil
}
// Rule is a declarative description of a single iptables rule: the IP
// family, table, chain, and the match/target arguments (without the action).
type Rule struct {
	IPVer IPVersion
	Table Table
	Chain string
	Args  []string
}

// Exists returns true if the rule exists in the kernel.
func (r Rule) Exists() bool {
	return GetIptable(r.IPVer).Exists(r.Table, r.Chain, r.Args...)
}

// cmdArgs returns the full iptables argument list for applying op to r.
func (r Rule) cmdArgs(op Action) []string {
	return append([]string{"-t", string(r.Table), string(op), r.Chain}, r.Args...)
}

// exec applies op (append/insert/delete) to the rule via iptables.
func (r Rule) exec(op Action) error {
	return GetIptable(r.IPVer).RawCombinedOutput(r.cmdArgs(op)...)
}

// WithChain returns a version of the rule with its Chain field set to chain.
func (r Rule) WithChain(chain string) Rule {
	// Clone Args so the copy doesn't alias the original's backing array.
	wc := r
	wc.Args = slices.Clone(r.Args)
	wc.Chain = chain
	return wc
}
// Append appends the rule to the end of the chain. If the rule already exists anywhere in the
// chain, this is a no-op.
func (r Rule) Append() error {
	if r.Exists() {
		return nil
	}
	return r.exec(Append)
}

// Insert inserts the rule at the head of the chain. If the rule already exists anywhere in the
// chain, this is a no-op.
func (r Rule) Insert() error {
	if r.Exists() {
		return nil
	}
	return r.exec(Insert)
}

// Delete deletes the rule from the kernel. If the rule does not exist, this is a no-op.
func (r Rule) Delete() error {
	if !r.Exists() {
		return nil
	}
	return r.exec(Delete)
}
// String renders the rule as the iptables/ip6tables command line that would
// append it to its chain. Intended for logging and debugging only.
func (r Rule) String() string {
	// Use the Append constant rather than a raw "-A" literal, consistent
	// with how actions are expressed everywhere else in this package.
	cmd := append([]string{"iptables"}, r.cmdArgs(Append)...)
	if r.IPVer == IPv6 {
		cmd[0] = "ip6tables"
	}
	return strings.Join(cmd, " ")
}
package netlabel
const (
	// Prefix constant marks the reserved label space for libnetwork
	Prefix = "com.docker.network"
	// DriverPrefix constant marks the reserved label space for libnetwork drivers
	DriverPrefix = Prefix + ".driver"
	// DriverPrivatePrefix constant marks the reserved label space
	// for internal libnetwork drivers
	DriverPrivatePrefix = DriverPrefix + ".private"
	// GenericData constant that helps to identify an option as a Generic constant
	GenericData = Prefix + ".generic"
	// PortMap constant represents Port Mapping
	PortMap = Prefix + ".portmap"
	// MacAddress constant represents Mac Address config of a Container
	MacAddress = Prefix + ".endpoint.macaddress"
	// ExposedPorts constant represents the container's Exposed Ports
	ExposedPorts = Prefix + ".endpoint.exposedports"
	// DNSServers is a list of DNS servers associated with the endpoint
	DNSServers = Prefix + ".endpoint.dnsservers"
	// EndpointSysctls is a comma separated list of interface-specific sysctls
	// where the interface name is represented by the string "IFNAME".
	EndpointSysctls = Prefix + ".endpoint.sysctls"
	// Ifname can be used to set the interface name used inside the container. It takes precedence over ContainerIfacePrefix.
	Ifname = Prefix + ".endpoint.ifname"
	// EnableIPv4 constant represents enabling IPV4 at network level
	EnableIPv4 = Prefix + ".enable_ipv4"
	// EnableIPv6 constant represents enabling IPV6 at network level
	EnableIPv6 = Prefix + ".enable_ipv6"
	// DriverMTU constant represents the MTU size for the network driver
	DriverMTU = DriverPrefix + ".mtu"
	// AdvertiseAddrNMsgs is the number of unsolicited ARP/NA messages that will be sent to
	// advertise an interface's IP and MAC addresses.
	AdvertiseAddrNMsgs = Prefix + ".advertise_addr_nmsgs"
	// AdvertiseAddrIntervalMs is the minimum interval between ARP/NA advertisements for
	// an interface's IP and MAC addresses (in milliseconds).
	AdvertiseAddrIntervalMs = Prefix + ".advertise_addr_ms"
	// OverlayVxlanIDList constant represents a list of VXLAN Ids as csv
	OverlayVxlanIDList = DriverPrefix + ".overlay.vxlanid_list"
	// Gateway represents the gateway for the network
	Gateway = Prefix + ".gateway"
	// Internal constant represents that the network is internal which disables default gateway service
	Internal = Prefix + ".internal"
	// ContainerIfacePrefix can be used to override the interface prefix used inside the container
	ContainerIfacePrefix = Prefix + ".container_iface_prefix"
	// HostIPv4 is the Source-IPv4 Address used to SNAT IPv4 container traffic
	HostIPv4 = Prefix + ".host_ipv4"
	// HostIPv6 is the Source-IPv6 Address used to SNAT IPv6 container traffic
	HostIPv6 = Prefix + ".host_ipv6"
	// NoProxy6To4 disables proxying from an IPv6 host port to an IPv4-only
	// container, when the default binding address is 0.0.0.0. This label
	// is intended for internal use, it may be removed in a future release.
	NoProxy6To4 = DriverPrivatePrefix + ".no_proxy_6to4"
)
// GetIfname returns the value associated to the Ifname netlabel from the
// provided options. If there's no Ifname netlabel, or if the value isn't a
// string, it returns an empty string.
func GetIfname(opts map[string]interface{}) string {
	if v, ok := opts[Ifname].(string); ok {
		return v
	}
	return ""
}
// Package netutils provides network utility functions.
package netutils
import (
"context"
"crypto/rand"
"encoding/hex"
"fmt"
"io"
"net"
"strings"
"sync"
"github.com/containerd/log"
)
// GenerateMACFromIP returns a locally administered MAC address where the 4 least
// significant bytes are derived from the IPv4 address.
func GenerateMACFromIP(ip net.IP) net.HardwareAddr {
hw := make(net.HardwareAddr, 6)
// The first byte of the MAC address has to comply with these rules:
// 1. Unicast: Set the least-significant bit to 0.
// 2. Address is locally administered: Set the second-least-significant bit (U/L) to 1.
hw[0] = 0x02
// The first 24 bits of the MAC represent the Organizationally Unique Identifier (OUI).
// Since this address is locally administered, we can do whatever we want as long as
// it doesn't conflict with other addresses.
hw[1] = 0x42
// Fill the remaining 4 bytes based on the input
if ip == nil {
rand.Read(hw[2:])
} else {
copy(hw[2:], ip.To4())
}
return hw
}
// GenerateRandomMAC returns a new 6-byte(48-bit) hardware address (MAC)
// that is not multicast and has the local assignment bit set.
func GenerateRandomMAC() net.HardwareAddr {
	hw := make(net.HardwareAddr, 6)
	rand.Read(hw)
	// Clear the multicast bit (unicast) and set the locally-administered bit
	// in a single expression.
	hw[0] = hw[0]&0xfe | 0x02
	return hw
}
// GenerateRandomName returns a string of the specified length, created by joining the prefix to random hex characters.
// The length must be strictly larger than len(prefix), or an error will be returned.
func GenerateRandomName(prefix string, length int) (string, error) {
	random := length - len(prefix)
	if random <= 0 {
		return "", fmt.Errorf("invalid length %d for prefix %s", length, prefix)
	}
	// Each random byte yields two hex characters; round up so at least
	// `random` characters are available after encoding.
	b := make([]byte, (random+1)/2)
	if _, err := io.ReadFull(rand.Reader, b); err != nil {
		return "", err
	}
	// Slice to the exact requested length (hex encoding may have produced
	// one extra character).
	return (prefix + hex.EncodeToString(b))[:length], nil
}
// ReverseIP accepts a V4 or V6 IP string in the canonical form and returns a reversed IP in
// the dotted decimal form . This is used to setup the IP to service name mapping in the optimal
// way for the DNS PTR queries.
func ReverseIP(IP string) string {
	var parts []string
	if net.ParseIP(IP).To4() != nil {
		// IPv4: reverse the four dotted-decimal labels.
		parts = strings.Split(IP, ".")
	} else {
		// IPv6: expand to the full 32 hex digits, then reverse digit by
		// digit. Reversed IPv6 is represented in dotted decimal instead of
		// the typical colon hex notation.
		groups := strings.Split(IP, ":")
		for i, g := range groups {
			switch {
			case g == "":
				// expand the compressed 0s
				groups[i] = strings.Repeat("0000", 8-strings.Count(IP, ":"))
			case len(g) < 4:
				// 0-padding needed
				groups[i] = strings.Repeat("0", 4-len(g)) + g
			}
		}
		parts = strings.Split(strings.Join(groups, ""), "")
	}
	// In-place reversal of the label slice.
	for i, j := 0, len(parts)-1; i < j; i, j = i+1, j-1 {
		parts[i], parts[j] = parts[j], parts[i]
	}
	return strings.Join(parts, ".")
}
var (
	// v6ListenableCached memoizes the result of the first IsV6Listenable
	// probe; v6ListenableOnce guards the one-time probe.
	v6ListenableCached bool
	v6ListenableOnce   sync.Once
)
// IsV6Listenable returns true when `[::1]:0` is listenable.
// IsV6Listenable returns false mostly when the kernel was booted with `ipv6.disable=1` option.
func IsV6Listenable() bool {
	v6ListenableOnce.Do(func() {
		// Probe by actually binding a TCP socket on the IPv6 loopback;
		// the result is cached for the lifetime of the process.
		ln, err := net.Listen("tcp6", "[::1]:0")
		if err != nil {
			// When the kernel was booted with `ipv6.disable=1`,
			// we get err "listen tcp6 [::1]:0: socket: address family not supported by protocol"
			// https://github.com/moby/moby/issues/42288
			log.G(context.TODO()).Debugf("v6Listenable=false (%v)", err)
		} else {
			v6ListenableCached = true
			ln.Close()
		}
	})
	return v6ListenableCached
}
// MustParseMAC returns a net.HardwareAddr or panic.
func MustParseMAC(s string) net.HardwareAddr {
	mac, err := net.ParseMAC(s)
	if err == nil {
		return mac
	}
	panic(err)
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23 && linux
package netutils
import (
"net/netip"
"os"
"slices"
"github.com/docker/docker/daemon/libnetwork/internal/netiputil"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/docker/docker/daemon/libnetwork/resolvconf"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/internal/nlwrap"
"github.com/pkg/errors"
"github.com/vishvananda/netlink"
)
// InferReservedNetworks returns a list of network prefixes that seem to be
// used by the system and that would likely break it if they were assigned to
// some Docker networks. It uses two heuristics to build that list:
//
// 1. Nameservers configured in /etc/resolv.conf ;
// 2. On-link routes ;
//
// That 2nd heuristic was originally not limited to on-links -- all non-default
// routes were checked (see [1]). This proved to be not ideal at best and
// highly problematic at worst:
//
// - VPN software and appliances doing split tunneling might push a small set
// of routes for large, aggregated prefixes to avoid maintenance and
// potential issues whenever a new subnet comes into use on internal
// network. However, not all subnets from these aggregates might be in use.
// - For full tunneling, especially when implemented with OpenVPN, the
// situation is even worse as the host might end up with the two following
// routes: 0.0.0.0/1 and 128.0.0.0/1. They are functionally
// indistinguishable from a default route, yet the Engine was treating them
// differently. With those routes, there was no way to use dynamic subnet
// allocation at all. (see 'def1' on [2])
// - A subnet covered by the default route can be used, or not. Same for
// non-default and non-on-link routes. The type of route says little about
// the availability of subnets it covers, except for on-link routes as they
// specifically define what subnet the current host is part of.
//
// The 2nd heuristic was modified to be limited to on-link routes in PR #42598
// (first released in v23.0, see [3]).
//
// If these heuristics don't detect an overlap, users should change their daemon
// config to remove that overlapping prefix from `default-address-pools`. If a
// prefix is found to overlap but users care enough about it being associated
// to a Docker network they can still rely on static allocation.
//
// For IPv6, the 2nd heuristic isn't applied as there's no such thing as
// on-link routes for IPv6.
//
// [1]: https://github.com/moby/libnetwork/commit/56832d6d89bf0f9d5280849026ee25ae4ae5f22e
// [2]: https://community.openvpn.net/openvpn/wiki/Openvpn23ManPage
// [3]: https://github.com/moby/moby/pull/42598
func InferReservedNetworks(v6 bool) []netip.Prefix {
	var reserved []netip.Prefix

	// Heuristic 1: nameservers from resolv.conf, keeping only the requested
	// address family. We don't really care if os.ReadFile fails here -- the
	// file either doesn't exist or can't be read for some reason.
	if rc, err := os.ReadFile(resolvconf.Path()); err == nil {
		nameservers := resolvconf.GetNameserversAsPrefix(rc)
		reserved = slices.DeleteFunc(nameservers, func(p netip.Prefix) bool {
			return p.Addr().Is6() != v6
		})
	}

	// Heuristic 2: on-link routes (applied to IPv4 only; see doc comment).
	if !v6 {
		reserved = append(reserved, queryOnLinkRoutes()...)
	}

	slices.SortFunc(reserved, netiputil.PrefixCompare)
	return reserved
}
// queryOnLinkRoutes returns a list of on-link routes available on the host.
// Only IPv4 prefixes are returned as there's no such thing as on-link
// routes for IPv6.
func queryOnLinkRoutes() []netip.Prefix {
	routes, err := ns.NlHandle().RouteList(nil, netlink.FAMILY_V4)
	if err != nil {
		// Best-effort: no routes on error.
		return nil
	}
	var prefixes []netip.Prefix
	for _, r := range routes {
		// Keep only scope-link routes with a concrete (non-0.0.0.0)
		// destination prefix.
		if r.Scope != netlink.SCOPE_LINK || r.Dst == nil || r.Dst.IP.IsUnspecified() {
			continue
		}
		if p, ok := netiputil.ToPrefix(r.Dst); ok {
			prefixes = append(prefixes, p)
		}
	}
	return prefixes
}
// GenerateIfaceName returns an interface name using the passed in
// prefix and the length of random bytes. The api ensures that the
// there is no interface which exists with that name.
// GenerateIfaceName returns an interface name using the passed in
// prefix and the length of random bytes. The api ensures that the
// there is no interface which exists with that name.
func GenerateIfaceName(nlh nlwrap.Handle, prefix string, length int) (string, error) {
	// Try a handful of candidate names before giving up.
	for attempt := 0; attempt < 3; attempt++ {
		name, err := GenerateRandomName(prefix, length)
		if err != nil {
			return "", err
		}
		var lookupErr error
		if nlh.Handle == nil {
			_, lookupErr = nlwrap.LinkByName(name)
		} else {
			_, lookupErr = nlh.LinkByName(name)
		}
		if lookupErr == nil {
			// An interface with this name already exists; try another.
			continue
		}
		if errors.As(lookupErr, &netlink.LinkNotFoundError{}) {
			// Name is free.
			return name, nil
		}
		return "", lookupErr
	}
	return "", types.InternalErrorf("could not generate interface name")
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package libnetwork
import (
"context"
"encoding/json"
"errors"
"fmt"
"net"
"net/netip"
"runtime"
"strings"
"sync"
"time"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/datastore"
"github.com/docker/docker/daemon/libnetwork/driverapi"
"github.com/docker/docker/daemon/libnetwork/internal/netiputil"
"github.com/docker/docker/daemon/libnetwork/internal/setmatrix"
"github.com/docker/docker/daemon/libnetwork/ipamapi"
"github.com/docker/docker/daemon/libnetwork/ipams/defaultipam"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/netutils"
"github.com/docker/docker/daemon/libnetwork/networkdb"
"github.com/docker/docker/daemon/libnetwork/options"
"github.com/docker/docker/daemon/libnetwork/scope"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/internal/sliceutil"
"github.com/docker/docker/pkg/stringid"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
)
// EndpointWalker is a client provided function which will be used to walk the Endpoints.
// When the function returns true, the walk will stop.
type EndpointWalker func(ep *Endpoint) bool

// ipInfo is the reverse mapping from IP to service name to serve the PTR query.
// extResolver is set if an external server resolves a service name to this IP.
// It's an indication to defer PTR queries also to that external server.
type ipInfo struct {
	name        string
	serviceID   string
	extResolver bool
}

// svcMapEntry is the body of the element into the svcMap
// The ip is a string because the SetMatrix does not accept non hashable values
type svcMapEntry struct {
	ip        string
	serviceID string
}

// svcInfo groups the name->IP (v4/v6), IP->name, and port mappings kept for
// service discovery on a network.
type svcInfo struct {
	svcMap     setmatrix.SetMatrix[string, svcMapEntry]
	svcIPv6Map setmatrix.SetMatrix[string, svcMapEntry]
	ipMap      setmatrix.SetMatrix[string, ipInfo]
	service    map[string][]servicePorts
}

// serviceTarget describes a backing container or host's info
type serviceTarget struct {
	name string
	ip   net.IP
	port uint16
}

// servicePorts groups the targets reachable through one named port/protocol.
type servicePorts struct {
	portName string
	proto    string
	target   []serviceTarget
}

// networkDBTable pairs a networkdb table name with the driver object type it
// carries.
type networkDBTable struct {
	name    string
	objType driverapi.ObjectType
}
// IpamConf contains all the ipam related configurations for a network
//
// TODO(aker): use proper net/* structs instead of string literals.
type IpamConf struct {
	// PreferredPool is the master address pool for containers and network interfaces.
	PreferredPool string
	// SubPool is a subset of the master pool. If specified,
	// this becomes the container pool for automatic address allocations.
	SubPool string
	// Gateway is the preferred Network Gateway address (optional).
	Gateway string
	// AuxAddresses contains auxiliary addresses for network driver. Must be within the master pool.
	// libnetwork will reserve them if they fall into the container pool.
	AuxAddresses map[string]string
}
// Validate checks whether the configuration is valid
func (c *IpamConf) Validate() error {
	// Only the gateway is validated; an empty gateway is allowed.
	if c.Gateway == "" {
		return nil
	}
	if net.ParseIP(c.Gateway) == nil {
		return types.InvalidParameterErrorf("invalid gateway address %s in Ipam configuration", c.Gateway)
	}
	return nil
}
// Contains checks whether the ipam master address pool contains [addr].
// It returns false for a nil receiver, an empty pool, or a pool that cannot
// be parsed as a CIDR.
func (c *IpamConf) Contains(addr net.IP) bool {
	if c == nil || c.PreferredPool == "" {
		return false
	}
	// Check the ParseCIDR error: the previous code ignored it, so a
	// malformed PreferredPool left allowedRange nil and panicked below.
	_, allowedRange, err := net.ParseCIDR(c.PreferredPool)
	if err != nil {
		return false
	}
	return allowedRange.Contains(addr)
}
// IsStatic checks whether the subnet was statically allocated (ie. user-defined).
// A nil receiver is treated as not static.
func (c *IpamConf) IsStatic() bool {
	return c != nil && c.PreferredPool != ""
}
// IpamInfo contains all the ipam related operational info for a network
type IpamInfo struct {
	PoolID string
	// Meta carries driver-specific metadata for the pool.
	Meta map[string]string
	driverapi.IPAMData
}

// MarshalJSON encodes IpamInfo into json message
func (i *IpamInfo) MarshalJSON() ([]byte, error) {
	m := map[string]any{
		"PoolID": i.PoolID,
	}
	// IPAMData is stored as a nested JSON string, mirroring what
	// UnmarshalJSON expects.
	v, err := json.Marshal(&i.IPAMData)
	if err != nil {
		return nil, err
	}
	m["IPAMData"] = string(v)
	if i.Meta != nil {
		m["Meta"] = i.Meta
	}
	return json.Marshal(m)
}
// UnmarshalJSON decodes json message into PoolData. It returns an error
// (instead of panicking, as the previous unchecked type assertions did) when
// "PoolID" is missing/non-string or "IPAMData" is not a string.
func (i *IpamInfo) UnmarshalJSON(data []byte) error {
	var m map[string]any
	if err := json.Unmarshal(data, &m); err != nil {
		return err
	}
	// Guard the type assertion: a missing or non-string PoolID used to panic.
	poolID, ok := m["PoolID"].(string)
	if !ok {
		return fmt.Errorf("invalid or missing PoolID in IpamInfo JSON")
	}
	i.PoolID = poolID
	if v, ok := m["Meta"]; ok {
		b, _ := json.Marshal(v) //nolint:errchkjson // FIXME: handle json (Un)Marshal errors
		if err := json.Unmarshal(b, &i.Meta); err != nil {
			return err
		}
	}
	if v, ok := m["IPAMData"]; ok {
		// IPAMData is stored as a nested JSON string (see MarshalJSON).
		s, ok := v.(string)
		if !ok {
			return fmt.Errorf("invalid IPAMData in IpamInfo JSON")
		}
		if err := json.Unmarshal([]byte(s), &i.IPAMData); err != nil {
			return err
		}
	}
	return nil
}
// Network represents a logical connectivity zone that containers may
// join using the Link method. A network is managed by a specific driver.
type Network struct {
	ctrlr       *Controller
	name        string
	networkType string // networkType is the name of the netdriver used by this network
	id          string
	created     time.Time
	scope       string // network data scope
	labels      map[string]string
	ipamType    string // ipamType is the name of the IPAM driver
	ipamOptions map[string]string
	addrSpace   string
	ipamV4Config []*IpamConf // user-supplied IPv4 IPAM configuration
	ipamV6Config []*IpamConf // user-supplied IPv6 IPAM configuration
	ipamV4Info   []*IpamInfo // operational IPv4 IPAM state
	ipamV6Info   []*IpamInfo // operational IPv6 IPAM state
	enableIPv4   bool
	enableIPv6   bool
	generic      options.Generic // driver-facing generic options (netlabel keys)
	dbIndex      uint64
	dbExists     bool
	persist      bool // when false, Skip() tells the datastore not to persist this network
	drvOnce      *sync.Once
	resolver     []*Resolver
	internal     bool
	attachable   bool
	inDelete     bool // set while the network is being torn down (see delete)
	ingress      bool
	driverTables []networkDBTable
	dynamic      bool
	configOnly   bool   // network only carries configuration for other networks
	configFrom   string // name of the configuration network this network derives from
	loadBalancerIP   net.IP
	loadBalancerMode string
	skipGwAllocIPv4  bool
	skipGwAllocIPv6  bool
	platformNetwork //nolint:nolintlint,unused // only populated on windows
	// mu protects the fields above; see the accessor methods below.
	mu sync.Mutex
}
// Load-balancer modes supported by a network; NAT is the default
// (applied in UnmarshalJSON when no mode is stored).
const (
	loadBalancerModeNAT     = "NAT"
	loadBalancerModeDSR     = "DSR"
	loadBalancerModeDefault = loadBalancerModeNAT
)
// Name returns a user chosen name for this network.
func (n *Network) Name() string {
	n.mu.Lock()
	name := n.name
	n.mu.Unlock()
	return name
}
// ID returns a system generated id for this network.
func (n *Network) ID() string {
	n.mu.Lock()
	id := n.id
	n.mu.Unlock()
	return id
}
// Created returns the time at which this network was created.
func (n *Network) Created() time.Time {
	n.mu.Lock()
	created := n.created
	n.mu.Unlock()
	return created
}
// Type returns the type of network, which corresponds to its managing driver.
func (n *Network) Type() string {
	n.mu.Lock()
	networkType := n.networkType
	n.mu.Unlock()
	return networkType
}
// Resolvers returns the embedded DNS resolvers attached to this network.
func (n *Network) Resolvers() []*Resolver {
	n.mu.Lock()
	resolvers := n.resolver
	n.mu.Unlock()
	return resolvers
}
// Key returns the datastore key path for this network.
func (n *Network) Key() []string {
	n.mu.Lock()
	id := n.id
	n.mu.Unlock()
	return []string{datastore.NetworkKeyPrefix, id}
}
// KeyPrefix returns the datastore key prefix common to all networks.
func (n *Network) KeyPrefix() []string {
	return []string{datastore.NetworkKeyPrefix}
}
// Value returns the JSON-encoded form of the network, or nil if
// marshalling fails (the error is intentionally discarded).
func (n *Network) Value() []byte {
	n.mu.Lock()
	defer n.mu.Unlock()
	if b, err := json.Marshal(n); err == nil {
		return b
	}
	return nil
}
// SetValue populates the network from its JSON-encoded datastore value.
func (n *Network) SetValue(value []byte) error {
	return json.Unmarshal(value, n)
}
// Index returns the datastore index of this network.
func (n *Network) Index() uint64 {
	n.mu.Lock()
	idx := n.dbIndex
	n.mu.Unlock()
	return idx
}
// SetIndex records the datastore index and marks the network as persisted.
func (n *Network) SetIndex(index uint64) {
	n.mu.Lock()
	defer n.mu.Unlock()
	n.dbIndex = index
	n.dbExists = true
}
// Exists reports whether the network is present in the datastore.
func (n *Network) Exists() bool {
	n.mu.Lock()
	exists := n.dbExists
	n.mu.Unlock()
	return exists
}
// Skip reports whether the datastore should skip persisting this network.
func (n *Network) Skip() bool {
	n.mu.Lock()
	persist := n.persist
	n.mu.Unlock()
	return !persist
}
// New returns a fresh Network bound to the same controller and scope,
// suitable for the datastore to unmarshal into.
func (n *Network) New() datastore.KVObject {
	n.mu.Lock()
	defer n.mu.Unlock()
	nw := &Network{}
	nw.ctrlr = n.ctrlr
	nw.drvOnce = &sync.Once{}
	nw.scope = n.scope
	return nw
}
// CopyTo deep copies to the destination IpamConfig
func (c *IpamConf) CopyTo(dstC *IpamConf) error {
	dstC.PreferredPool = c.PreferredPool
	dstC.SubPool = c.SubPool
	dstC.Gateway = c.Gateway
	if c.AuxAddresses == nil {
		return nil
	}
	aux := make(map[string]string, len(c.AuxAddresses))
	for key, addr := range c.AuxAddresses {
		aux[key] = addr
	}
	dstC.AuxAddresses = aux
	return nil
}
// CopyTo deep copies to the destination IpamInfo
func (i *IpamInfo) CopyTo(dstI *IpamInfo) error {
	dstI.PoolID = i.PoolID
	if i.Meta != nil {
		meta := make(map[string]string)
		for key, val := range i.Meta {
			meta[key] = val
		}
		dstI.Meta = meta
	}
	// Deep-copy the embedded IPAMData fields.
	dstI.AddressSpace = i.AddressSpace
	dstI.Pool = types.GetIPNetCopy(i.Pool)
	dstI.Gateway = types.GetIPNetCopy(i.Gateway)
	if i.AuxAddresses != nil {
		aux := make(map[string]*net.IPNet)
		for key, ipnet := range i.AuxAddresses {
			aux[key] = types.GetIPNetCopy(ipnet)
		}
		dstI.AuxAddresses = aux
	}
	return nil
}
// validateConfiguration enforces the constraints on config-only networks
// and on networks created from a configuration network (configFrom).
func (n *Network) validateConfiguration() error {
	if n.configOnly {
		// Only supports network specific configurations.
		// Network operator configurations are not supported.
		if n.ingress || n.internal || n.attachable || n.scope != "" {
			return types.ForbiddenErrorf("configuration network can only contain network " +
				"specific fields. Network operator fields like " +
				"[ ingress | internal | attachable | scope ] are not supported.")
		}
	}
	if n.configFrom == "" {
		if err := n.validateAdvertiseAddrConfig(); err != nil {
			return err
		}
	} else {
		if n.configOnly {
			return types.ForbiddenErrorf("a configuration network cannot depend on another configuration network")
		}
		// Check that no config has been set for this --config-from network.
		// (Note that the default for enableIPv4 is 'true', ipamType has its own default,
		// and other settings are zero valued by default.)
		// Note: '&&' binds tighter than '||', so the first clause only
		// rejects a non-empty ipamType that differs from the default.
		if n.ipamType != "" &&
			n.ipamType != defaultIpamForNetworkType(n.networkType) ||
			!n.enableIPv4 || n.enableIPv6 ||
			len(n.labels) > 0 || len(n.ipamOptions) > 0 ||
			len(n.ipamV4Config) > 0 || len(n.ipamV6Config) > 0 {
			return types.ForbiddenErrorf("user specified configurations are not supported if the network depends on a configuration network")
		}
		if len(n.generic) > 0 {
			if data, ok := n.generic[netlabel.GenericData]; ok {
				var (
					driverOptions map[string]string
					opts          any
				)
				// GenericData may arrive as either map type; round-trip it
				// through JSON to normalize into map[string]string.
				switch t := data.(type) {
				case map[string]any, map[string]string:
					opts = t
				}
				ba, err := json.Marshal(opts)
				if err != nil {
					return fmt.Errorf("failed to validate network configuration: %v", err)
				}
				if err := json.Unmarshal(ba, &driverOptions); err != nil {
					return fmt.Errorf("failed to validate network configuration: %v", err)
				}
				// Any driver option at all is forbidden for --config-from networks.
				if len(driverOptions) > 0 {
					return types.ForbiddenErrorf("network driver options are not supported if the network depends on a configuration network")
				}
			}
		}
	}
	return nil
}
// applyConfigurationTo applies network specific configurations.
func (n *Network) applyConfigurationTo(to *Network) error {
	to.enableIPv4 = n.enableIPv4
	to.enableIPv6 = n.enableIPv6
	if len(n.labels) > 0 {
		// The target map is freshly created, so a plain copy is equivalent
		// to the previous insert-if-absent loop.
		to.labels = make(map[string]string, len(n.labels))
		for key, val := range n.labels {
			to.labels[key] = val
		}
	}
	if n.ipamType != "" {
		to.ipamType = n.ipamType
	}
	if len(n.ipamOptions) > 0 {
		to.ipamOptions = make(map[string]string, len(n.ipamOptions))
		for key, val := range n.ipamOptions {
			to.ipamOptions[key] = val
		}
	}
	if len(n.ipamV4Config) > 0 {
		to.ipamV4Config = append([]*IpamConf(nil), n.ipamV4Config...)
	}
	if len(n.ipamV6Config) > 0 {
		to.ipamV6Config = append([]*IpamConf(nil), n.ipamV6Config...)
	}
	if len(n.generic) > 0 {
		// Note: this replaces any generic options already present on "to".
		fresh := options.Generic{}
		for key, val := range n.generic {
			fresh[key] = val
		}
		to.generic = fresh
	}
	// Network drivers only see generic flags. So, make sure they match.
	if to.generic == nil {
		to.generic = options.Generic{}
	}
	to.generic[netlabel.Internal] = to.internal
	to.generic[netlabel.EnableIPv4] = to.enableIPv4
	to.generic[netlabel.EnableIPv6] = to.enableIPv6
	return nil
}
// CopyTo deep copies this network's state into the destination KVObject,
// which must be a *Network. Scalar fields are assigned directly; labels,
// ipamOptions, IPAM configs/infos and generic options are deep copied.
func (n *Network) CopyTo(o datastore.KVObject) error {
	n.mu.Lock()
	defer n.mu.Unlock()
	dstN := o.(*Network)
	dstN.name = n.name
	dstN.id = n.id
	dstN.created = n.created
	dstN.networkType = n.networkType
	dstN.scope = n.scope
	dstN.dynamic = n.dynamic
	dstN.ipamType = n.ipamType
	dstN.enableIPv4 = n.enableIPv4
	dstN.enableIPv6 = n.enableIPv6
	dstN.persist = n.persist
	dstN.dbIndex = n.dbIndex
	dstN.dbExists = n.dbExists
	dstN.drvOnce = n.drvOnce
	dstN.internal = n.internal
	dstN.attachable = n.attachable
	dstN.inDelete = n.inDelete
	dstN.ingress = n.ingress
	dstN.configOnly = n.configOnly
	dstN.configFrom = n.configFrom
	dstN.loadBalancerIP = n.loadBalancerIP
	dstN.loadBalancerMode = n.loadBalancerMode
	dstN.skipGwAllocIPv4 = n.skipGwAllocIPv4
	dstN.skipGwAllocIPv6 = n.skipGwAllocIPv6
	// copy labels
	// Note: labels are merged into any map already present on dstN.
	if dstN.labels == nil {
		dstN.labels = make(map[string]string, len(n.labels))
	}
	for k, v := range n.labels {
		dstN.labels[k] = v
	}
	if n.ipamOptions != nil {
		dstN.ipamOptions = make(map[string]string, len(n.ipamOptions))
		for k, v := range n.ipamOptions {
			dstN.ipamOptions[k] = v
		}
	}
	// IPAM configs and operational infos are deep copied element by element;
	// entries are appended to any existing slices on dstN.
	for _, v4conf := range n.ipamV4Config {
		dstV4Conf := &IpamConf{}
		if err := v4conf.CopyTo(dstV4Conf); err != nil {
			return err
		}
		dstN.ipamV4Config = append(dstN.ipamV4Config, dstV4Conf)
	}
	for _, v4info := range n.ipamV4Info {
		dstV4Info := &IpamInfo{}
		if err := v4info.CopyTo(dstV4Info); err != nil {
			return err
		}
		dstN.ipamV4Info = append(dstN.ipamV4Info, dstV4Info)
	}
	for _, v6conf := range n.ipamV6Config {
		dstV6Conf := &IpamConf{}
		if err := v6conf.CopyTo(dstV6Conf); err != nil {
			return err
		}
		dstN.ipamV6Config = append(dstN.ipamV6Config, dstV6Conf)
	}
	for _, v6info := range n.ipamV6Info {
		dstV6Info := &IpamInfo{}
		if err := v6info.CopyTo(dstV6Info); err != nil {
			return err
		}
		dstN.ipamV6Info = append(dstN.ipamV6Info, dstV6Info)
	}
	// Generic options are replaced wholesale with a shallow copy.
	dstN.generic = options.Generic{}
	for k, v := range n.generic {
		dstN.generic[k] = v
	}
	return nil
}
// validateAdvertiseAddrConfig validates both advertise-address settings and
// reports all problems at once (errors.Join drops nil entries).
func (n *Network) validateAdvertiseAddrConfig() error {
	_, errNMsgs := n.validatedAdvertiseAddrNMsgs()
	_, errInterval := n.validatedAdvertiseAddrInterval()
	return errors.Join(errNMsgs, errInterval)
}
// advertiseAddrNMsgs returns the configured message count, if set.
func (n *Network) advertiseAddrNMsgs() (int, bool) {
	// On Linux, config was validated before network creation. This
	// path is for un-set values and unsupported platforms.
	if v, err := n.validatedAdvertiseAddrNMsgs(); err == nil && v != nil {
		return *v, true
	}
	return 0, false
}
// advertiseAddrInterval returns the configured advertise interval, if set.
func (n *Network) advertiseAddrInterval() (time.Duration, bool) {
	// On Linux, config was validated before network creation. This
	// path is for un-set values and unsupported platforms.
	if v, err := n.validatedAdvertiseAddrInterval(); err == nil && v != nil {
		return *v, true
	}
	return 0, false
}
// MarshalJSON encodes the network into the map-based document stored in the
// datastore. IPAM configs and infos are embedded as JSON strings, matching
// the format UnmarshalJSON expects.
func (n *Network) MarshalJSON() ([]byte, error) {
	// TODO: Can be made much more generic with the help of reflection (but has some golang limitations)
	netMap := make(map[string]any)
	netMap["name"] = n.name
	netMap["id"] = n.id
	netMap["created"] = n.created
	netMap["networkType"] = n.networkType
	netMap["scope"] = n.scope
	netMap["labels"] = n.labels
	netMap["ipamType"] = n.ipamType
	netMap["ipamOptions"] = n.ipamOptions
	netMap["addrSpace"] = n.addrSpace
	netMap["enableIPv4"] = n.enableIPv4
	netMap["enableIPv6"] = n.enableIPv6
	if n.generic != nil {
		netMap["generic"] = n.generic
	}
	netMap["persist"] = n.persist
	// Empty IPAM slices are omitted entirely rather than stored as "[]".
	if len(n.ipamV4Config) > 0 {
		ics, err := json.Marshal(n.ipamV4Config)
		if err != nil {
			return nil, err
		}
		netMap["ipamV4Config"] = string(ics)
	}
	if len(n.ipamV4Info) > 0 {
		iis, err := json.Marshal(n.ipamV4Info)
		if err != nil {
			return nil, err
		}
		netMap["ipamV4Info"] = string(iis)
	}
	if len(n.ipamV6Config) > 0 {
		ics, err := json.Marshal(n.ipamV6Config)
		if err != nil {
			return nil, err
		}
		netMap["ipamV6Config"] = string(ics)
	}
	if len(n.ipamV6Info) > 0 {
		iis, err := json.Marshal(n.ipamV6Info)
		if err != nil {
			return nil, err
		}
		netMap["ipamV6Info"] = string(iis)
	}
	netMap["internal"] = n.internal
	netMap["attachable"] = n.attachable
	netMap["inDelete"] = n.inDelete
	netMap["ingress"] = n.ingress
	netMap["configOnly"] = n.configOnly
	netMap["configFrom"] = n.configFrom
	netMap["loadBalancerIP"] = n.loadBalancerIP
	netMap["loadBalancerMode"] = n.loadBalancerMode
	netMap["skipGwAllocIPv4"] = n.skipGwAllocIPv4
	netMap["skipGwAllocIPv6"] = n.skipGwAllocIPv6
	return json.Marshal(netMap)
}
// UnmarshalJSON decodes a stored network document (see MarshalJSON) into n,
// applying defaults for keys absent from documents written by older versions.
//
// NOTE(review): the bare type assertions (e.g. netMap["name"].(string)) will
// panic rather than return an error on a malformed document; this mirrors the
// pre-existing behavior for trusted datastore content.
func (n *Network) UnmarshalJSON(b []byte) (err error) {
	// TODO: Can be made much more generic with the help of reflection (but has some golang limitations)
	var netMap map[string]any
	if err := json.Unmarshal(b, &netMap); err != nil {
		return err
	}
	n.name = netMap["name"].(string)
	n.id = netMap["id"].(string)
	// "created" is not available in older versions
	if v, ok := netMap["created"]; ok {
		// n.created is time.Time but marshalled as string
		if err = n.created.UnmarshalText([]byte(v.(string))); err != nil {
			log.G(context.TODO()).Warnf("failed to unmarshal creation time %v: %v", v, err)
			n.created = time.Time{}
		}
	}
	n.networkType = netMap["networkType"].(string)
	n.enableIPv4 = true // Default for networks created before the option to disable IPv4 was added.
	if v, ok := netMap["enableIPv4"]; ok {
		n.enableIPv4 = v.(bool)
	}
	n.enableIPv6 = netMap["enableIPv6"].(bool)
	// if we weren't unmarshaling to netMap we could simply set n.labels
	// unfortunately, we can't because map[string]interface{} != map[string]string
	if labels, ok := netMap["labels"].(map[string]any); ok {
		n.labels = make(map[string]string, len(labels))
		for label, value := range labels {
			n.labels[label] = value.(string)
		}
	}
	if v, ok := netMap["ipamOptions"]; ok {
		if iOpts, ok := v.(map[string]any); ok {
			n.ipamOptions = make(map[string]string, len(iOpts))
			for k, v := range iOpts {
				n.ipamOptions[k] = v.(string)
			}
		}
	}
	if v, ok := netMap["generic"]; ok {
		n.generic = v.(map[string]any)
		// Restore opts in their map[string]string form
		if gv, ok := n.generic[netlabel.GenericData]; ok {
			var lmap map[string]string
			ba, err := json.Marshal(gv)
			if err != nil {
				return err
			}
			if err := json.Unmarshal(ba, &lmap); err != nil {
				return err
			}
			n.generic[netlabel.GenericData] = lmap
		}
	}
	if v, ok := netMap["persist"]; ok {
		n.persist = v.(bool)
	}
	// ipamType falls back to the default IPAM driver when not stored.
	if v, ok := netMap["ipamType"]; ok {
		n.ipamType = v.(string)
	} else {
		n.ipamType = defaultipam.DriverName
	}
	if v, ok := netMap["addrSpace"]; ok {
		n.addrSpace = v.(string)
	}
	// IPAM configs and infos are stored as nested JSON strings (see MarshalJSON).
	if v, ok := netMap["ipamV4Config"]; ok {
		if err := json.Unmarshal([]byte(v.(string)), &n.ipamV4Config); err != nil {
			return err
		}
	}
	if v, ok := netMap["ipamV4Info"]; ok {
		if err := json.Unmarshal([]byte(v.(string)), &n.ipamV4Info); err != nil {
			return err
		}
	}
	if v, ok := netMap["ipamV6Config"]; ok {
		if err := json.Unmarshal([]byte(v.(string)), &n.ipamV6Config); err != nil {
			return err
		}
	}
	if v, ok := netMap["ipamV6Info"]; ok {
		if err := json.Unmarshal([]byte(v.(string)), &n.ipamV6Info); err != nil {
			return err
		}
	}
	if v, ok := netMap["internal"]; ok {
		n.internal = v.(bool)
	}
	if v, ok := netMap["attachable"]; ok {
		n.attachable = v.(bool)
	}
	if s, ok := netMap["scope"]; ok {
		n.scope = s.(string)
	}
	if v, ok := netMap["inDelete"]; ok {
		n.inDelete = v.(bool)
	}
	if v, ok := netMap["ingress"]; ok {
		n.ingress = v.(bool)
	}
	if v, ok := netMap["configOnly"]; ok {
		n.configOnly = v.(bool)
	}
	if v, ok := netMap["configFrom"]; ok {
		n.configFrom = v.(string)
	}
	if v, ok := netMap["loadBalancerIP"]; ok {
		n.loadBalancerIP = net.ParseIP(v.(string))
	}
	// Documents written before load-balancer modes existed default to NAT.
	n.loadBalancerMode = loadBalancerModeDefault
	if v, ok := netMap["loadBalancerMode"]; ok {
		n.loadBalancerMode = v.(string)
	}
	if v, ok := netMap["skipGwAllocIPv4"]; ok {
		n.skipGwAllocIPv4 = v.(bool)
	}
	if v, ok := netMap["skipGwAllocIPv6"]; ok {
		n.skipGwAllocIPv6 = v.(bool)
	}
	return nil
}
// NetworkOption is an option setter function type used to pass various options to
// NewNetwork method. The various setter functions of type NetworkOption are
// provided by libnetwork, they look like NetworkOptionXXXX(...)
type NetworkOption func(n *Network)
// NetworkOptionGeneric function returns an option setter for a Generic option defined
// in a Dictionary of Key-Value pair
func NetworkOptionGeneric(generic map[string]any) NetworkOption {
	return func(nw *Network) {
		if nw.generic == nil {
			nw.generic = make(map[string]any)
		}
		for key, val := range generic {
			// A few well-known labels also update their dedicated fields.
			switch key {
			case netlabel.EnableIPv4:
				nw.enableIPv4 = val.(bool)
			case netlabel.EnableIPv6:
				nw.enableIPv6 = val.(bool)
			case netlabel.Internal:
				nw.internal = val.(bool)
			}
			nw.generic[key] = val
		}
	}
}
// NetworkOptionIngress returns an option setter to indicate if a network is
// an ingress network.
func NetworkOptionIngress(ingress bool) NetworkOption {
	return func(nw *Network) {
		nw.ingress = ingress
	}
}
// NetworkOptionPersist returns an option setter to set persistence policy for a network
func NetworkOptionPersist(persist bool) NetworkOption {
	return func(nw *Network) {
		nw.persist = persist
	}
}
// NetworkOptionEnableIPv4 returns an option setter to explicitly configure IPv4
func NetworkOptionEnableIPv4(enableIPv4 bool) NetworkOption {
	return func(nw *Network) {
		// Mirror the setting into the driver-facing generic options.
		nw.enableIPv4 = enableIPv4
		if nw.generic == nil {
			nw.generic = make(map[string]any)
		}
		nw.generic[netlabel.EnableIPv4] = enableIPv4
	}
}
// NetworkOptionEnableIPv6 returns an option setter to explicitly configure IPv6
func NetworkOptionEnableIPv6(enableIPv6 bool) NetworkOption {
	return func(nw *Network) {
		// Mirror the setting into the driver-facing generic options.
		nw.enableIPv6 = enableIPv6
		if nw.generic == nil {
			nw.generic = make(map[string]any)
		}
		nw.generic[netlabel.EnableIPv6] = enableIPv6
	}
}
// NetworkOptionInternalNetwork returns an option setter to config the network
// to be internal which disables default gateway service
func NetworkOptionInternalNetwork() NetworkOption {
	return func(nw *Network) {
		// Mirror the setting into the driver-facing generic options.
		nw.internal = true
		if nw.generic == nil {
			nw.generic = make(map[string]any)
		}
		nw.generic[netlabel.Internal] = true
	}
}
// NetworkOptionAttachable returns an option setter to set attachable for a network
func NetworkOptionAttachable(attachable bool) NetworkOption {
	return func(nw *Network) {
		nw.attachable = attachable
	}
}
// NetworkOptionScope returns an option setter to overwrite the network's scope.
// By default the network's scope is set to the network driver's datascope.
func NetworkOptionScope(scope string) NetworkOption {
	return func(nw *Network) {
		nw.scope = scope
	}
}
// NetworkOptionIpam function returns an option setter for the ipam configuration for this network
func NetworkOptionIpam(ipamDriver string, addrSpace string, ipV4 []*IpamConf, ipV6 []*IpamConf, opts map[string]string) NetworkOption {
	return func(nw *Network) {
		switch ipamDriver {
		case "":
			// Keep the network's current IPAM driver.
		case defaultipam.DriverName:
			// The generic default resolves to a network-type specific one.
			nw.ipamType = defaultIpamForNetworkType(nw.Type())
		default:
			nw.ipamType = ipamDriver
		}
		nw.ipamOptions = opts
		nw.addrSpace = addrSpace
		nw.ipamV4Config = ipV4
		nw.ipamV6Config = ipV6
	}
}
// NetworkOptionLBEndpoint function returns an option setter for the configuration of the load balancer endpoint for this network
func NetworkOptionLBEndpoint(ip net.IP) NetworkOption {
	return func(nw *Network) {
		nw.loadBalancerIP = ip
	}
}
// NetworkOptionDriverOpts function returns an option setter for any driver parameter described by a map
func NetworkOptionDriverOpts(opts map[string]string) NetworkOption {
	return func(nw *Network) {
		if nw.generic == nil {
			nw.generic = make(map[string]any)
		}
		// Normalize a nil map to an empty one so drivers always see a map.
		if opts == nil {
			opts = make(map[string]string)
		}
		nw.generic[netlabel.GenericData] = opts
	}
}
// NetworkOptionLabels function returns an option setter for labels specific to a network
func NetworkOptionLabels(labels map[string]string) NetworkOption {
	return func(nw *Network) {
		nw.labels = labels
	}
}
// NetworkOptionDynamic function returns an option setter for dynamic option for a network
func NetworkOptionDynamic() NetworkOption {
	return func(nw *Network) {
		nw.dynamic = true
	}
}
// NetworkOptionConfigOnly tells controller this network is
// a configuration only network. It serves as a configuration
// for other networks.
func NetworkOptionConfigOnly() NetworkOption {
	return func(nw *Network) {
		nw.configOnly = true
	}
}
// NetworkOptionConfigFrom tells controller to pick the
// network configuration from a configuration only network
func NetworkOptionConfigFrom(name string) NetworkOption {
	return func(nw *Network) {
		nw.configFrom = name
	}
}
// processOptions applies each non-nil option setter to the network.
func (n *Network) processOptions(options ...NetworkOption) {
	for _, apply := range options {
		if apply == nil {
			continue
		}
		apply(n)
	}
}
// networkDeleteParams holds the optional settings for a network delete.
type networkDeleteParams struct {
	// rmLBEndpoint requests removal of the network's load-balancer endpoint.
	rmLBEndpoint bool
}
// NetworkDeleteOption is a type for optional parameters to pass to the
// Network.Delete() function.
type NetworkDeleteOption func(p *networkDeleteParams)
// NetworkDeleteOptionRemoveLB informs a Network.Delete() operation that should
// remove the load balancer endpoint for this network. Note that the Delete()
// method will automatically remove a load balancing endpoint for most networks
// when the network is otherwise empty. However, this does not occur for some
// networks. In particular, networks marked as ingress (which are supposed to
// be more permanent than other overlay networks) won't automatically remove
// the LB endpoint on Delete(). This method allows for explicit removal of
// such networks provided there are no other endpoints present in the network.
// If the network still has non-LB endpoints present, Delete() will not
// remove the LB endpoint and will return an error.
func NetworkDeleteOptionRemoveLB(p *networkDeleteParams) {
	p.rmLBEndpoint = true
}
// resolveDriver looks up the named network driver, optionally loading it if
// it is not yet registered. When load is false and the driver is absent,
// it returns nil without error.
func (n *Network) resolveDriver(name string, load bool) (driverapi.Driver, driverapi.Capability, error) {
	c := n.getController()
	// Fast path: driver already present in the registry.
	if d, capabilities := c.drvRegistry.Driver(name); d != nil {
		return d, capabilities, nil
	}
	if !load {
		// don't fail if driver loading is not required
		return nil, driverapi.Capability{}, nil
	}
	if err := c.loadDriver(name); err != nil {
		return nil, driverapi.Capability{}, err
	}
	d, capabilities := c.drvRegistry.Driver(name)
	if d == nil {
		return nil, driverapi.Capability{}, fmt.Errorf("could not resolve driver %s in registry", name)
	}
	return d, capabilities, nil
}
// driverIsMultihost reports whether the network's driver provides global
// (multi-host) connectivity.
func (n *Network) driverIsMultihost() bool {
	if _, capabilities, err := n.resolveDriver(n.networkType, true); err == nil {
		return capabilities.ConnectivityScope == scope.Global
	}
	return false
}
// driver resolves this network's driver, optionally loading it, and lazily
// fixes up the network's data scope from the driver capabilities.
func (n *Network) driver(load bool) (driverapi.Driver, error) {
	d, capabilities, err := n.resolveDriver(n.networkType, load)
	if err != nil {
		return nil, err
	}
	n.mu.Lock()
	// If load is not required, driver, cap and err may all be nil
	if n.scope == "" {
		n.scope = capabilities.DataScope
	}
	if n.dynamic {
		// If the network is dynamic, then it is swarm
		// scoped regardless of the backing driver.
		n.scope = scope.Swarm
	}
	n.mu.Unlock()
	return d, nil
}
// Delete the network.
func (n *Network) Delete(options ...NetworkDeleteOption) error {
var params networkDeleteParams
for _, opt := range options {
opt(¶ms)
}
return n.delete(false, params.rmLBEndpoint)
}
// This function gets called in 3 ways:
//   - Delete() -- (false, false)
//     remove if endpoint count == 0 or endpoint count == 1 and
//     there is a load balancer IP
//   - Delete(libnetwork.NetworkDeleteOptionRemoveLB) -- (false, true)
//     remove load balancer and network if endpoint count == 1
//   - controller.networkCleanup() -- (true, true)
//     remove the network no matter what
func (n *Network) delete(force bool, rmLBEndpoint bool) error {
	n.mu.Lock()
	c := n.ctrlr
	name := n.name
	id := n.id
	n.mu.Unlock()
	// Serialize all delete/join activity on this network id.
	c.networkLocker.Lock(id)
	defer c.networkLocker.Unlock(id) //nolint:errcheck
	// Re-fetch the network under the lock; "n" is rebound to the
	// authoritative copy from the store for the rest of this function.
	n, err := c.getNetworkFromStore(id)
	if err != nil {
		return errdefs.NotFound(fmt.Errorf("unknown network %s id %s", name, id))
	}
	// Only remove ingress on force removal or explicit LB endpoint removal
	if n.ingress && !force && !rmLBEndpoint {
		return &ActiveEndpointsError{name: n.name, id: n.id}
	}
	// A configuration network cannot be removed while other networks
	// still reference it via --config-from.
	if !force && n.configOnly {
		refNws := c.findNetworks(filterNetworkByConfigFrom(n.name))
		if len(refNws) > 0 {
			return types.ForbiddenErrorf("configuration network %q is in use", n.Name())
		}
	}
	// Check that the network is empty
	// (the LB endpoint, if present, does not count against emptiness).
	var emptyCount int
	if n.hasLoadBalancerEndpoint() {
		emptyCount = 1
	}
	eps := c.findEndpoints(filterEndpointByNetworkId(n.id))
	if !force && len(eps) > emptyCount {
		return &ActiveEndpointsError{
			name: n.name,
			id:   n.id,
			endpoints: sliceutil.Map(eps, func(ep *Endpoint) string {
				return fmt.Sprintf(`name:%q id:%q`, ep.name, stringid.TruncateID(ep.id))
			}),
		}
	}
	if n.hasLoadBalancerEndpoint() {
		// If we got to this point, then the following must hold:
		//  * force is true OR endpoint count == 1
		if err := n.deleteLoadBalancerSandbox(); err != nil {
			if !force {
				return err
			}
			// continue deletion when force is true even on error
			log.G(context.TODO()).Warnf("Error deleting load balancer sandbox: %v", err)
		}
	}
	// Up to this point, errors that we returned were recoverable.
	// From here on, any errors leave us in an inconsistent state.
	// This is unfortunate, but there isn't a safe way to
	// reconstitute a load-balancer endpoint after removing it.
	// Mark the network for deletion
	n.inDelete = true
	if err = c.storeNetwork(context.TODO(), n); err != nil {
		return fmt.Errorf("error marking network %s (%s) for deletion: %v", n.Name(), n.ID(), err)
	}
	// Config-only networks have no dataplane/IPAM state to tear down.
	if n.configOnly {
		goto removeFromStore
	}
	n.ipamRelease()
	// We are about to delete the network. Leave the gossip
	// cluster for the network to stop all incoming network
	// specific gossip updates before cleaning up all the service
	// bindings for the network. But cleanup service binding
	// before deleting the network from the store since service
	// bindings cleanup requires the network in the store.
	n.cancelDriverWatches()
	if err = n.leaveCluster(); err != nil {
		log.G(context.TODO()).Errorf("Failed leaving network %s from the agent cluster: %v", n.Name(), err)
	}
	// Cleanup the service discovery for this network
	c.cleanupServiceDiscovery(n.ID())
	// Cleanup the load balancer. On Windows this call is required
	// to remove remote loadbalancers in VFP, and must be performed before
	// dataplane network deletion.
	if runtime.GOOS == "windows" {
		c.cleanupServiceBindings(n.ID())
	}
	// Delete the network from the dataplane
	if err = n.deleteNetwork(); err != nil {
		if !force {
			return err
		}
		log.G(context.TODO()).Debugf("driver failed to delete stale network %s (%s): %v", n.Name(), n.ID(), err)
	}
removeFromStore:
	// deleteFromStore performs an atomic delete operation and the
	// Network.epCnt will help prevent any possible
	// race between endpoint join and network delete
	//
	// TODO(robmry) - remove this once downgrade past 28.1.0 is no longer supported.
	// The endpoint count is no longer used, it's created in the store to make
	// downgrade work, versions older than 28.1.0 expect to read it and error if they
	// can't. The stored count is not maintained, so the downgraded version will
	// always find it's zero (which is usually correct because the daemon had
	// stopped), but older daemons fix it on startup anyway.
	if err = c.deleteFromStore(&endpointCnt{n: n}); err != nil {
		log.G(context.TODO()).Debugf("Error deleting endpoint count from store for stale network %s (%s) for deletion: %v", n.Name(), n.ID(), err)
	}
	if err = c.deleteStoredNetwork(n); err != nil {
		return fmt.Errorf("error deleting network from store: %v", err)
	}
	return nil
}
// deleteNetwork asks the driver to remove the network from the dataplane
// and stops any embedded DNS resolvers.
func (n *Network) deleteNetwork() error {
	d, err := n.driver(true)
	if err != nil {
		return fmt.Errorf("failed deleting Network: %v", err)
	}
	if err := d.DeleteNetwork(n.ID()); err != nil {
		switch err.(type) {
		case types.ForbiddenError:
			// Forbidden Errors should be honored
			return err
		case types.MaskableError:
			// Maskable errors are silently ignored.
		default:
			log.G(context.TODO()).Warnf("driver error deleting network %s : %v", n.name, err)
		}
	}
	for _, resolver := range n.resolver {
		resolver.Stop()
	}
	return nil
}
// addEndpoint asks the network driver to create the endpoint in the dataplane.
func (n *Network) addEndpoint(ctx context.Context, ep *Endpoint) error {
	d, err := n.driver(true)
	if err != nil {
		return fmt.Errorf("failed to add endpoint: %v", err)
	}
	if err := d.CreateEndpoint(ctx, n.id, ep.id, ep.Iface(), ep.generic); err != nil {
		return types.InternalErrorf("failed to create endpoint %s on network %s: %v",
			ep.Name(), n.Name(), err)
	}
	return nil
}
// CreateEndpoint creates a new endpoint to this network symbolically identified by the
// specified unique name. The options parameter carries driver specific options.
func (n *Network) CreateEndpoint(ctx context.Context, name string, options ...EndpointOption) (*Endpoint, error) {
	if strings.TrimSpace(name) == "" {
		return nil, types.InvalidParameterErrorf("invalid name: name is empty")
	}
	if n.ConfigOnly() {
		return nil, types.ForbiddenErrorf("cannot create endpoint on configuration-only network")
	}
	// A nil error from the lookup means an endpoint with this name exists.
	if _, err := n.EndpointByName(name); err == nil {
		return nil, types.ForbiddenErrorf("endpoint with name %s already exists in network %s", name, n.Name())
	}
	n.ctrlr.networkLocker.Lock(n.id)
	defer n.ctrlr.networkLocker.Unlock(n.id) //nolint:errcheck
	return n.createEndpoint(ctx, name, options...)
}
// createEndpoint performs the actual endpoint creation: it allocates
// addresses, creates the endpoint in the driver, persists it, and registers
// its DNS records. Each step is paired with a deferred rollback that fires
// only if a later step sets err, so the defers must remain in this order.
func (n *Network) createEndpoint(ctx context.Context, name string, options ...EndpointOption) (*Endpoint, error) {
	var err error
	ep := &Endpoint{name: name, generic: make(map[string]any), iface: &EndpointInterface{}}
	ep.id = stringid.GenerateRandomID()
	// Initialize ep.network with a possibly stale copy of n. We need this to get network from
	// store. But once we get it from store we will have the most uptodate copy possibly.
	ep.network = n
	ep.network, err = ep.getNetworkFromStore()
	if err != nil {
		log.G(ctx).Errorf("failed to get network during CreateEndpoint: %v", err)
		return nil, err
	}
	n = ep.network
	ep.processOptions(options...)
	// Any caller-supplied link-local addresses must actually be link-local.
	for _, llIPNet := range ep.Iface().LinkLocalAddresses() {
		if !llIPNet.IP.IsLinkLocalUnicast() {
			return nil, types.InvalidParameterErrorf("invalid link local IP address: %v", llIPNet.IP)
		}
	}
	// A caller-supplied MAC address (via generic options) takes precedence.
	if opt, ok := ep.generic[netlabel.MacAddress]; ok {
		if mac, ok := opt.(net.HardwareAddr); ok {
			ep.iface.mac = mac
		}
	}
	ipam, capability, err := n.getController().getIPAMDriver(n.ipamType)
	if err != nil {
		return nil, err
	}
	// Some IPAM drivers need a MAC address to allocate; generate one if absent.
	if capability.RequiresMACAddress {
		if ep.iface.mac == nil {
			ep.iface.mac = netutils.GenerateRandomMAC()
		}
		if ep.ipamOptions == nil {
			ep.ipamOptions = make(map[string]string)
		}
		ep.ipamOptions[netlabel.MacAddress] = ep.iface.mac.String()
	}
	wantIPv6 := n.enableIPv6 && !ep.disableIPv6
	if err = ep.assignAddress(ipam, n.enableIPv4, wantIPv6); err != nil {
		return nil, err
	}
	// Rollback: release allocated addresses if any later step fails.
	defer func() {
		if err != nil {
			ep.releaseAddress()
		}
	}()
	if err = n.addEndpoint(ctx, ep); err != nil {
		return nil, err
	}
	// Rollback: remove the endpoint from the driver on later failure.
	defer func() {
		if err != nil {
			if e := ep.deleteEndpoint(false); e != nil {
				log.G(ctx).Warnf("cleaning up endpoint failed %s : %v", name, e)
			}
		}
	}()
	// We should perform storeEndpoint call right after addEndpoint
	// in order to have iface properly configured
	if err = n.getController().storeEndpoint(ctx, ep); err != nil {
		return nil, err
	}
	// Rollback: remove the stored endpoint on later failure.
	defer func() {
		if err != nil {
			if e := n.getController().deleteStoredEndpoint(ep); e != nil {
				log.G(ctx).Warnf("error rolling back endpoint %s from store: %v", name, e)
			}
		}
	}()
	// Swarm-scoped multihost networks manage their service records elsewhere.
	if !n.getController().isSwarmNode() || n.Scope() != scope.Swarm || !n.driverIsMultihost() {
		n.updateSvcRecord(context.WithoutCancel(ctx), ep, true)
		defer func() {
			if err != nil {
				n.updateSvcRecord(context.WithoutCancel(ctx), ep, false)
			}
		}()
	}
	return ep, nil
}
// Endpoints returns the list of Endpoint(s) in this network.
func (n *Network) Endpoints() []*Endpoint {
	eps, err := n.getEndpointsFromStore()
	if err != nil {
		// Best effort: log and return whatever was retrieved.
		log.G(context.TODO()).Error(err)
	}
	return eps
}
// WalkEndpoints uses the provided function to walk the Endpoints.
// Walking stops as soon as the walker returns true.
func (n *Network) WalkEndpoints(walker EndpointWalker) {
	for _, ep := range n.Endpoints() {
		if stop := walker(ep); stop {
			return
		}
	}
}
// EndpointByName returns the Endpoint which has the passed name. If not found,
// an [errdefs.ErrNotFound] is returned.
func (n *Network) EndpointByName(name string) (*Endpoint, error) {
	if name == "" {
		return nil, types.InvalidParameterErrorf("invalid name: name is empty")
	}
	var found *Endpoint
	n.WalkEndpoints(func(current *Endpoint) bool {
		if current.Name() != name {
			return false
		}
		found = current
		return true // stop walking
	})
	if found == nil {
		return nil, errdefs.NotFound(fmt.Errorf("endpoint %s not found", name))
	}
	return found, nil
}
// updateSvcRecord adds or deletes local DNS records for a given Endpoint.
func (n *Network) updateSvcRecord(ctx context.Context, ep *Endpoint, isAdd bool) {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.updateSvcRecord", trace.WithAttributes(
		attribute.String("ep.name", ep.name),
		attribute.Bool("isAdd", isAdd)))
	defer span.End()

	iface := ep.Iface()
	if iface == nil {
		return
	}
	var ipv4, ipv6 net.IP
	if addr := iface.Address(); addr != nil {
		ipv4 = addr.IP
	}
	if addr := iface.AddressIPv6(); addr != nil {
		ipv6 = addr.IP
	}
	serviceID := ep.svcID
	if serviceID == "" {
		serviceID = ep.ID()
	}
	// Select the record operation once, then apply it to every DNS name.
	// Only the first name (i == 0) updates the reverse (PTR) mapping.
	update := n.addSvcRecords
	if !isAdd {
		update = n.deleteSvcRecords
	}
	for i, dnsName := range ep.getDNSNames() {
		update(ep.ID(), dnsName, serviceID, ipv4, ipv6, i == 0, "updateSvcRecord")
	}
}
// addIPToName records a reverse (IP -> name) mapping for PTR lookups.
func addIPToName(ipMap *setmatrix.SetMatrix[string, ipInfo], name, serviceID string, ip net.IP) {
	ipMap.Insert(netutils.ReverseIP(ip.String()), ipInfo{name: name, serviceID: serviceID})
}
func delIPToName(ipMap *setmatrix.SetMatrix[string, ipInfo], name, serviceID string, ip net.IP) {
reverseIP := netutils.ReverseIP(ip.String())
ipMap.Remove(reverseIP, ipInfo{
name: name,
serviceID: serviceID,
})
}
func addNameToIP(svcMap *setmatrix.SetMatrix[string, svcMapEntry], name, serviceID string, epIP net.IP) {
// Since DNS name resolution is case-insensitive, Use the lower-case form
// of the name as the key into svcMap
lowerCaseName := strings.ToLower(name)
svcMap.Insert(lowerCaseName, svcMapEntry{
ip: epIP.String(),
serviceID: serviceID,
})
}
func delNameToIP(svcMap *setmatrix.SetMatrix[string, svcMapEntry], name, serviceID string, epIP net.IP) {
lowerCaseName := strings.ToLower(name)
svcMap.Remove(lowerCaseName, svcMapEntry{
ip: epIP.String(),
serviceID: serviceID,
})
}
// addSvcRecords adds the forward (and optionally PTR) DNS records for an
// endpoint to the controller's per-network service records.
//
// TODO(aker): remove ipMapUpdate param and add a proper method dedicated to update PTR records.
func (n *Network) addSvcRecords(eID, name, serviceID string, epIPv4, epIPv6 net.IP, ipMapUpdate bool, method string) {
	// Do not add service names for ingress network as this is a
	// routing only network
	if n.ingress {
		return
	}
	networkID := n.ID()
	log.G(context.TODO()).Debugf("%s (%.7s).addSvcRecords(%s, %s, %s, %t) %s sid:%s", eID, networkID, name, epIPv4, epIPv6, ipMapUpdate, method, serviceID)
	c := n.getController()
	c.mu.Lock()
	defer c.mu.Unlock()
	records, ok := c.svcRecords[networkID]
	if !ok {
		// First record for this network; create its svcInfo lazily.
		records = &svcInfo{}
		c.svcRecords[networkID] = records
	}
	if ipMapUpdate {
		for _, ip := range []net.IP{epIPv4, epIPv6} {
			if ip != nil {
				addIPToName(&records.ipMap, name, serviceID, ip)
			}
		}
	}
	if epIPv4 != nil {
		addNameToIP(&records.svcMap, name, serviceID, epIPv4)
	}
	if epIPv6 != nil {
		addNameToIP(&records.svcIPv6Map, name, serviceID, epIPv6)
	}
}
// deleteSvcRecords removes the forward (and optionally PTR) DNS records for
// an endpoint from the controller's per-network service records.
func (n *Network) deleteSvcRecords(eID, name, serviceID string, epIPv4, epIPv6 net.IP, ipMapUpdate bool, method string) {
	// Do not delete service names from ingress network as this is a
	// routing only network
	if n.ingress {
		return
	}
	networkID := n.ID()
	log.G(context.TODO()).Debugf("%s (%.7s).deleteSvcRecords(%s, %s, %s, %t) %s sid:%s ", eID, networkID, name, epIPv4, epIPv6, ipMapUpdate, method, serviceID)
	c := n.getController()
	c.mu.Lock()
	defer c.mu.Unlock()
	records, ok := c.svcRecords[networkID]
	if !ok {
		// Nothing recorded for this network; nothing to delete.
		return
	}
	if ipMapUpdate {
		for _, ip := range []net.IP{epIPv4, epIPv6} {
			if ip != nil {
				delIPToName(&records.ipMap, name, serviceID, ip)
			}
		}
	}
	if epIPv4 != nil {
		delNameToIP(&records.svcMap, name, serviceID, epIPv4)
	}
	if epIPv6 != nil {
		delNameToIP(&records.svcIPv6Map, name, serviceID, epIPv6)
	}
}
// getController returns the Controller this network belongs to.
func (n *Network) getController() *Controller {
	n.mu.Lock()
	c := n.ctrlr
	n.mu.Unlock()
	return c
}
// ipamAllocate allocates the network's address pools from its IPAM driver,
// IPv4 first (if enabled), then IPv6 (if enabled).
//
// retErr is named so the deferred rollback can observe the final error: if
// IPv6 allocation fails after IPv4 succeeded, the IPv4 pools are released
// before returning.
func (n *Network) ipamAllocate() (retErr error) {
	// "host"/"null" networks perform no address management.
	if n.hasSpecialDriver() {
		return nil
	}
	ipam, _, err := n.getController().getIPAMDriver(n.ipamType)
	if err != nil {
		return err
	}
	// Lazily derive the address space from the IPAM driver on first use.
	if n.addrSpace == "" {
		if n.addrSpace, err = n.deriveAddressSpace(); err != nil {
			return err
		}
	}
	if n.enableIPv4 {
		if err := n.ipamAllocateVersion(4, ipam); err != nil {
			return err
		}
		// Registered only after a successful IPv4 allocation: rolls it
		// back if the IPv6 allocation below fails.
		defer func() {
			if retErr != nil {
				n.ipamReleaseVersion(4, ipam)
			}
		}()
	}
	if n.enableIPv6 {
		if err := n.ipamAllocateVersion(6, ipam); err != nil {
			return err
		}
	}
	return nil
}
// ipamAllocateVersion allocates the address pools (plus optional gateway and
// auxiliary addresses) for one IP version (4 or 6) of the network, filling
// n.ipamV4Info or n.ipamV6Info with one entry per IPAM config.
func (n *Network) ipamAllocateVersion(ipVer int, ipam ipamapi.Ipam) error {
	var (
		cfgList     *[]*IpamConf
		infoList    *[]*IpamInfo
		skipGwAlloc bool
		err         error
	)
	switch ipVer {
	case 4:
		cfgList = &n.ipamV4Config
		infoList = &n.ipamV4Info
		skipGwAlloc = n.skipGwAllocIPv4
	case 6:
		cfgList = &n.ipamV6Config
		infoList = &n.ipamV6Info
		skipGwAlloc = n.skipGwAllocIPv6
	default:
		return types.InternalErrorf("incorrect ip version passed to ipam allocate: %d", ipVer)
	}
	// With no explicit configuration, allocate a single default pool.
	if len(*cfgList) == 0 {
		*cfgList = []*IpamConf{{}}
	}
	*infoList = make([]*IpamInfo, len(*cfgList))
	log.G(context.TODO()).Debugf("Allocating IPv%d pools for network %s (%s)", ipVer, n.Name(), n.ID())
	for i, cfg := range *cfgList {
		if err = cfg.Validate(); err != nil {
			return err
		}
		d := &IpamInfo{}
		(*infoList)[i] = d
		d.AddressSpace = n.addrSpace
		// Locally-scoped networks must avoid prefixes already in use on
		// the host.
		var reserved []netip.Prefix
		if n.Scope() != scope.Global {
			reserved = netutils.InferReservedNetworks(ipVer == 6)
		}
		// NOTE: the `:=` below declares a new err that shadows the
		// function-scoped err for the rest of this iteration.
		alloc, err := ipam.RequestPool(ipamapi.PoolRequest{
			AddressSpace: n.addrSpace,
			Pool:         cfg.PreferredPool,
			SubPool:      cfg.SubPool,
			Options:      n.ipamOptions,
			Exclude:      reserved,
			V6:           ipVer == 6,
		})
		if err != nil {
			return err
		}
		d.PoolID = alloc.PoolID
		d.Pool = netiputil.ToIPNet(alloc.Pool)
		d.Meta = alloc.Meta
		// Deferred inside the loop on purpose: this closure captures the
		// iteration-scoped err above, so at function return it releases
		// THIS iteration's pool only if a later step of this same
		// iteration failed. Pools from other iterations are presumably
		// released by the caller's rollback path — confirm.
		defer func() {
			if err != nil {
				if err := ipam.ReleasePool(d.PoolID); err != nil {
					log.G(context.TODO()).Warnf("Failed to release address pool %s after failure to create network %s (%s)", d.PoolID, n.Name(), n.ID())
				}
			}
		}()
		// If there's no user-configured gateway address but the IPAM driver returned a gw when it
		// set up the pool, use it. (It doesn't need to be requested/reserved in IPAM.)
		if cfg.Gateway == "" {
			if gws, ok := d.Meta[netlabel.Gateway]; ok {
				if d.Gateway, err = types.ParseCIDR(gws); err != nil {
					return types.InvalidParameterErrorf("failed to parse gateway address (%v) returned by ipam driver: %v", gws, err)
				}
			}
		}
		// If there's still no gateway, reserve cfg.Gateway if the user specified it. Else,
		// if the driver wants a gateway, let the IPAM driver select an address.
		if d.Gateway == nil && (cfg.Gateway != "" || !skipGwAlloc) {
			gatewayOpts := map[string]string{
				ipamapi.RequestAddressType: netlabel.Gateway,
			}
			if d.Gateway, _, err = ipam.RequestAddress(d.PoolID, net.ParseIP(cfg.Gateway), gatewayOpts); err != nil {
				return types.InternalErrorf("failed to allocate gateway (%v): %v", cfg.Gateway, err)
			}
		}
		// Auxiliary addresses must be part of the master address pool
		// If they fall into the container addressable pool, libnetwork will reserve them
		if cfg.AuxAddresses != nil {
			var ip net.IP
			d.IPAMData.AuxAddresses = make(map[string]*net.IPNet, len(cfg.AuxAddresses))
			for k, v := range cfg.AuxAddresses {
				if ip = net.ParseIP(v); ip == nil {
					return types.InvalidParameterErrorf("non parsable secondary ip address (%s:%s) passed for network %s", k, v, n.Name())
				}
				if !d.Pool.Contains(ip) {
					return types.ForbiddenErrorf("auxiliary address: (%s:%s) must belong to the master pool: %s", k, v, d.Pool)
				}
				// Attempt reservation in the container addressable pool, silent the error if address does not belong to that pool
				if d.IPAMData.AuxAddresses[k], _, err = ipam.RequestAddress(d.PoolID, ip, nil); err != nil && !errors.Is(err, ipamapi.ErrIPOutOfRange) {
					return types.InternalErrorf("failed to allocate secondary ip address (%s:%s): %v", k, v, err)
				}
			}
		}
	}
	return nil
}
// ipamRelease releases all IPAM pools held by the network. Errors are logged
// and otherwise ignored: this runs on network delete, which must proceed.
func (n *Network) ipamRelease() {
	if n.hasSpecialDriver() {
		return
	}
	ipam, _, err := n.getController().getIPAMDriver(n.ipamType)
	if err != nil {
		log.G(context.TODO()).Warnf("Failed to retrieve ipam driver to release address pool(s) on delete of network %s (%s): %v", n.Name(), n.ID(), err)
		return
	}
	for _, ipVer := range []int{4, 6} {
		n.ipamReleaseVersion(ipVer, ipam)
	}
}
// ipamReleaseVersion releases the gateway, auxiliary addresses, and pools
// previously allocated by ipamAllocateVersion for one IP version, then
// clears the corresponding info list. All failures are logged and ignored.
func (n *Network) ipamReleaseVersion(ipVer int, ipam ipamapi.Ipam) {
	var infoList *[]*IpamInfo
	switch ipVer {
	case 4:
		infoList = &n.ipamV4Info
	case 6:
		infoList = &n.ipamV6Info
	default:
		log.G(context.TODO()).Warnf("incorrect ip version passed to ipam release: %d", ipVer)
		return
	}
	if len(*infoList) == 0 {
		return
	}
	log.G(context.TODO()).Debugf("releasing IPv%d pools from network %s (%s)", ipVer, n.Name(), n.ID())
	for _, d := range *infoList {
		if d.Gateway != nil {
			// FIXME(robmry) - if an IPAM driver returned a gateway in Meta[netlabel.Gateway], and
			// no user config overrode that address, it wasn't explicitly allocated so it shouldn't
			// be released here?
			if err := ipam.ReleaseAddress(d.PoolID, d.Gateway.IP); err != nil {
				log.G(context.TODO()).Warnf("Failed to release gateway ip address %s on delete of network %s (%s): %v", d.Gateway.IP, n.Name(), n.ID(), err)
			}
		}
		if d.IPAMData.AuxAddresses != nil {
			for k, nw := range d.IPAMData.AuxAddresses {
				// Only addresses inside the pool were actually reserved
				// (see ipamAllocateVersion), so only those are released.
				if d.Pool.Contains(nw.IP) {
					if err := ipam.ReleaseAddress(d.PoolID, nw.IP); err != nil && !errors.Is(err, ipamapi.ErrIPOutOfRange) {
						log.G(context.TODO()).Warnf("Failed to release secondary ip address %s (%v) on delete of network %s (%s): %v", k, nw.IP, n.Name(), n.ID(), err)
					}
				}
			}
		}
		if err := ipam.ReleasePool(d.PoolID); err != nil {
			log.G(context.TODO()).Warnf("Failed to release address pool %s on delete of network %s (%s): %v", d.PoolID, n.Name(), n.ID(), err)
		}
	}
	// Drop the released info so a later release is a no-op.
	*infoList = nil
}
// getIPInfo returns a shallow copy of the IPAM info list for the given IP
// version (4 or 6). An unknown version yields nil.
//
// Fix: the n.ipamV4Info/n.ipamV6Info slice headers were previously read
// before acquiring n.mu, racing with writers such as ipamAllocateVersion
// and ipamReleaseVersion; the lock now covers the whole read.
func (n *Network) getIPInfo(ipVer int) []*IpamInfo {
	n.mu.Lock()
	defer n.mu.Unlock()
	var info []*IpamInfo
	switch ipVer {
	case 4:
		info = n.ipamV4Info
	case 6:
		info = n.ipamV6Info
	default:
		return nil
	}
	l := make([]*IpamInfo, 0, len(info))
	l = append(l, info...)
	return l
}
// getIPData returns a copy of the driver-facing IPAM data for the given IP
// version (4 or 6). An unknown version yields nil.
//
// Fix: the n.ipamV4Info/n.ipamV6Info slice headers were previously read
// before acquiring n.mu, racing with writers such as ipamAllocateVersion
// and ipamReleaseVersion; the lock now covers the whole read.
func (n *Network) getIPData(ipVer int) []driverapi.IPAMData {
	n.mu.Lock()
	defer n.mu.Unlock()
	var info []*IpamInfo
	switch ipVer {
	case 4:
		info = n.ipamV4Info
	case 6:
		info = n.ipamV6Info
	default:
		return nil
	}
	l := make([]driverapi.IPAMData, 0, len(info))
	for _, d := range info {
		l = append(l, d.IPAMData)
	}
	return l
}
// deriveAddressSpace asks the network's IPAM driver for its default address
// spaces and picks the one matching the network's scope (global vs local).
func (n *Network) deriveAddressSpace() (string, error) {
	ipam, _ := n.getController().ipamRegistry.IPAM(n.ipamType)
	if ipam == nil {
		return "", types.NotFoundErrorf("failed to get default address space: unknown ipam type %q", n.ipamType)
	}
	local, global, err := ipam.GetDefaultAddressSpaces()
	if err != nil {
		return "", types.NotFoundErrorf("failed to get default address space: %v", err)
	}
	if n.Scope() != scope.Global {
		return local, nil
	}
	return global, nil
}
// Peers returns a slice of PeerInfo structures which has the information about the peer
// nodes participating in the same overlay network. This is currently the per-network
// gossip cluster. For non-dynamic overlay networks and bridge networks it returns an
// empty slice
func (n *Network) Peers() []networkdb.PeerInfo {
	if !n.Dynamic() {
		return []networkdb.PeerInfo{}
	}
	agent := n.getController().getAgent()
	if agent == nil {
		return []networkdb.PeerInfo{}
	}
	return agent.networkDB.Peers(n.ID())
}
// DriverOptions returns the network driver's generic-data options, or an
// empty (non-nil) map when none are set.
func (n *Network) DriverOptions() map[string]string {
	n.mu.Lock()
	defer n.mu.Unlock()
	if n.generic == nil {
		return map[string]string{}
	}
	m, ok := n.generic[netlabel.GenericData]
	if !ok {
		return map[string]string{}
	}
	return m.(map[string]string)
}
// Scope returns the scope of the network.
func (n *Network) Scope() string {
	n.mu.Lock()
	s := n.scope
	n.mu.Unlock()
	return s
}
// IpamConfig returns the network's IPAM driver name, its options, and deep
// copies of the IPv4 and IPv6 IPAM configuration lists.
//
// Fix: copy failures for the IPv6 config were logged at Debug level while
// the IPv4 ones were logged at Error level; both now log at Error level.
func (n *Network) IpamConfig() (ipamType string, ipamOptions map[string]string, ipamV4Config []*IpamConf, ipamV6Config []*IpamConf) {
	n.mu.Lock()
	defer n.mu.Unlock()
	ipamV4Config = make([]*IpamConf, len(n.ipamV4Config))
	for i, c := range n.ipamV4Config {
		cc := &IpamConf{}
		if err := c.CopyTo(cc); err != nil {
			log.G(context.TODO()).WithError(err).Error("Error copying ipam ipv4 config")
		}
		ipamV4Config[i] = cc
	}
	ipamV6Config = make([]*IpamConf, len(n.ipamV6Config))
	for i, c := range n.ipamV6Config {
		cc := &IpamConf{}
		if err := c.CopyTo(cc); err != nil {
			log.G(context.TODO()).WithError(err).Error("Error copying ipam ipv6 config")
		}
		ipamV6Config[i] = cc
	}
	return n.ipamType, n.ipamOptions, ipamV4Config, ipamV6Config
}
// IpamInfo returns deep copies of the IPv4 and IPv6 IPAM info lists; copy
// failures are logged and the (possibly partial) copy is returned anyway.
func (n *Network) IpamInfo() (ipamV4Info []*IpamInfo, ipamV6Info []*IpamInfo) {
	n.mu.Lock()
	defer n.mu.Unlock()
	copyInfo := func(src []*IpamInfo, label string) []*IpamInfo {
		dst := make([]*IpamInfo, len(src))
		for i, info := range src {
			ic := &IpamInfo{}
			if err := info.CopyTo(ic); err != nil {
				log.G(context.TODO()).WithError(err).Error("Error copying " + label + " IPAM config")
			}
			dst[i] = ic
		}
		return dst
	}
	return copyInfo(n.ipamV4Info, "IPv4"), copyInfo(n.ipamV6Info, "IPv6")
}
// Internal reports whether the network is internal (no external connectivity).
func (n *Network) Internal() bool {
	n.mu.Lock()
	v := n.internal
	n.mu.Unlock()
	return v
}

// Attachable reports whether containers can be manually attached to the network.
func (n *Network) Attachable() bool {
	n.mu.Lock()
	v := n.attachable
	n.mu.Unlock()
	return v
}

// Ingress reports whether this is the routing-only ingress network.
func (n *Network) Ingress() bool {
	n.mu.Lock()
	v := n.ingress
	n.mu.Unlock()
	return v
}

// Dynamic reports whether the network was created dynamically.
func (n *Network) Dynamic() bool {
	n.mu.Lock()
	v := n.dynamic
	n.mu.Unlock()
	return v
}

// IPv4Enabled reports whether IPv4 is enabled on the network.
func (n *Network) IPv4Enabled() bool {
	n.mu.Lock()
	v := n.enableIPv4
	n.mu.Unlock()
	return v
}

// IPv6Enabled reports whether IPv6 is enabled on the network.
func (n *Network) IPv6Enabled() bool {
	n.mu.Lock()
	v := n.enableIPv6
	n.mu.Unlock()
	return v
}

// ConfigFrom returns the name of the config-only network this network takes
// its configuration from (empty when none).
func (n *Network) ConfigFrom() string {
	n.mu.Lock()
	v := n.configFrom
	n.mu.Unlock()
	return v
}

// ConfigOnly reports whether this is a config-only network.
func (n *Network) ConfigOnly() bool {
	n.mu.Lock()
	v := n.configOnly
	n.mu.Unlock()
	return v
}
// Labels returns a copy of the network's labels map.
func (n *Network) Labels() map[string]string {
	n.mu.Lock()
	defer n.mu.Unlock()
	out := make(map[string]string, len(n.labels))
	for key, value := range n.labels {
		out[key] = value
	}
	return out
}
// TableEventRegister registers a networkDB table of the given object type
// for this network's driver. It fails for unknown object types.
func (n *Network) TableEventRegister(tableName string, objType driverapi.ObjectType) error {
	if !driverapi.IsValidType(objType) {
		return fmt.Errorf("invalid object type %v in registering table, %s", objType, tableName)
	}
	n.mu.Lock()
	defer n.mu.Unlock()
	n.driverTables = append(n.driverTables, networkDBTable{
		name:    tableName,
		objType: objType,
	})
	return nil
}
// UpdateIpamConfig replaces the network's IPv4 IPAM configuration with one
// derived from the driver-provided IPAM data.
func (n *Network) UpdateIpamConfig(ipV4Data []driverapi.IPAMData) {
	confs := make([]*IpamConf, 0, len(ipV4Data))
	for _, data := range ipV4Data {
		confs = append(confs, &IpamConf{
			PreferredPool: data.Pool.String(),
			Gateway:       data.Gateway.IP.String(),
		})
	}
	n.mu.Lock()
	n.ipamV4Config = confs
	n.mu.Unlock()
}
// Special drivers are ones which do not need to perform any Network plumbing
func (n *Network) hasSpecialDriver() bool {
	switch n.Type() {
	case "host", "null":
		return true
	}
	return false
}

// hasLoadBalancerEndpoint reports whether a load-balancer IP is configured
// for the network.
func (n *Network) hasLoadBalancerEndpoint() bool {
	return len(n.loadBalancerIP) > 0
}
// ResolveName looks up addresses of ipType for name req.
// Returns (addresses, true) if req is found, but len(addresses) may be 0 if
// there are no addresses of ipType. If the name is not found, the bool return
// will be false.
func (n *Network) ResolveName(ctx context.Context, req string, ipType int) ([]net.IP, bool) {
	c := n.getController()
	networkID := n.ID()
	_, span := otel.Tracer("").Start(ctx, "Network.ResolveName", trace.WithAttributes(
		attribute.String("libnet.network.name", n.Name()),
		attribute.String("libnet.network.id", networkID),
	))
	defer span.End()
	c.mu.Lock()
	// TODO(aker): release the lock earlier
	defer c.mu.Unlock()
	records, ok := c.svcRecords[networkID]
	if !ok {
		return nil, false
	}
	// Names are stored lower-cased and without a trailing dot.
	lookup := strings.ToLower(strings.TrimSuffix(req, "."))
	v4Entries, ok4 := records.svcMap.Get(lookup)
	v6Entries, ok6 := records.svcIPv6Map.Get(lookup)
	if !ok4 && !ok6 {
		// No result for v4 or v6: the name doesn't exist.
		return nil, false
	}
	entries := v4Entries
	if ipType == types.IPv6 {
		entries = v6Entries
	}
	// Deduplicate IPs; duplicates can appear during a transition period
	// when two services use the same IP.
	seen := make(map[string]bool)
	var addrs []net.IP
	for _, entry := range entries {
		if seen[entry.ip] {
			continue
		}
		seen[entry.ip] = true
		addrs = append(addrs, net.ParseIP(entry.ip))
	}
	return addrs, true
}
// HandleQueryResp marks the PTR record for (name, ip) as resolved by an
// external resolver.
func (n *Network) HandleQueryResp(name string, ip net.IP) {
	networkID := n.ID()
	c := n.getController()
	c.mu.Lock()
	defer c.mu.Unlock()
	records, ok := c.svcRecords[networkID]
	if !ok {
		return
	}
	key := netutils.ReverseIP(ip.String())
	// If an entry with extResolver == true is already in the set, the
	// insert would fail — but that just means it was marked before, so the
	// Contains check below is sufficient.
	if exists, _ := records.ipMap.Contains(key, ipInfo{name: name}); exists {
		records.ipMap.Remove(key, ipInfo{name: name})
		records.ipMap.Insert(key, ipInfo{name: name, extResolver: true})
	}
}
// ResolveIP performs a reverse lookup: it returns the FQDN
// ("name.networkName") recorded for ip, or "" when there is no local record
// or the record is owned by an external resolver.
func (n *Network) ResolveIP(_ context.Context, ip string) string {
	networkID := n.ID()
	c := n.getController()
	c.mu.Lock()
	defer c.mu.Unlock()
	records, ok := c.svcRecords[networkID]
	if !ok {
		return ""
	}
	elemSet, ok := records.ipMap.Get(ip)
	if !ok || len(elemSet) == 0 {
		return ""
	}
	// NOTE it is possible to have more than one element in the Set, this will happen
	// because of interleave of different events from different sources (local container create vs
	// network db notifications)
	// In such cases the resolution will be based on the first element of the set, and can vary
	// during the system stabilization
	first := elemSet[0]
	if first.extResolver {
		return ""
	}
	return first.name + "." + n.Name()
}
// ResolveService answers an SRV query of the form "portName.proto.service",
// returning the matching SRV records and their backing IPs (nil, nil when
// the name cannot be parsed or nothing matches).
func (n *Network) ResolveService(ctx context.Context, name string) ([]*net.SRV, []net.IP) {
	c := n.getController()
	srv := []*net.SRV{}
	ip := []net.IP{}
	log.G(ctx).Debugf("Service name To resolve: %v", name)
	// There are DNS implementations that allow SRV queries for names not in
	// the format defined by RFC 2782. Hence specific validations checks are
	// not done
	portName, rest, okPort := strings.Cut(name, ".")
	proto, svcName, okProto := strings.Cut(rest, ".")
	if !okPort || !okProto {
		// Fewer than three dot-separated parts.
		return nil, nil
	}
	networkID := n.ID()
	c.mu.Lock()
	defer c.mu.Unlock()
	records, ok := c.svcRecords[networkID]
	if !ok {
		return nil, nil
	}
	svcs, ok := records.service[svcName]
	if !ok {
		return nil, nil
	}
	for _, svc := range svcs {
		if svc.portName != portName || svc.proto != proto {
			continue
		}
		for _, t := range svc.target {
			srv = append(srv, &net.SRV{
				Target: t.name,
				Port:   t.port,
			})
			ip = append(ip, t.ip)
		}
	}
	return srv, ip
}
// NdotsSet reports whether a custom ndots resolver option is set for the
// network; always false in this implementation.
func (n *Network) NdotsSet() bool {
	return false
}
// getConfigNetwork looks up a config-only network by name; it returns a
// not-found error when no config-only network has that name.
func (c *Controller) getConfigNetwork(name string) (*Network, error) {
	var found *Network
	c.WalkNetworks(func(nw *Network) bool {
		if !nw.ConfigOnly() || nw.Name() != name {
			return false
		}
		found = nw
		return true
	})
	if found != nil {
		return found, nil
	}
	return nil, types.NotFoundErrorf("configuration network %q not found", name)
}
// lbSandboxName returns the name of the network's load-balancer sandbox.
// The ingress network uses a different naming scheme.
func (n *Network) lbSandboxName() string {
	if n.ingress {
		return n.name + "-sbox"
	}
	return "lb-" + n.name
}

// lbEndpointName returns the name of the network's load-balancer endpoint.
func (n *Network) lbEndpointName() string {
	return n.name + "-endpoint"
}
// createLoadBalancerSandbox creates the sandbox and endpoint implementing
// this network's load balancer, joins the endpoint to the sandbox, and
// enables the service. On error, resources created so far are torn down via
// the deferred cleanups (which inspect the named return retErr).
//
// Fix: the two cleanup log messages previously read "on failure on failure".
func (n *Network) createLoadBalancerSandbox() (retErr error) {
	sandboxName := n.lbSandboxName()
	// Mark the sandbox to be a load balancer
	sbOptions := []SandboxOption{OptionLoadBalancer(n.id)}
	if n.ingress {
		sbOptions = append(sbOptions, OptionIngress())
	}
	sb, err := n.ctrlr.NewSandbox(context.TODO(), sandboxName, sbOptions...)
	if err != nil {
		return err
	}
	defer func() {
		if retErr != nil {
			if e := n.ctrlr.SandboxDestroy(context.WithoutCancel(context.TODO()), sandboxName); e != nil {
				log.G(context.TODO()).Warnf("could not delete sandbox %s on failure (%v): %v", sandboxName, retErr, e)
			}
		}
	}()
	endpointName := n.lbEndpointName()
	epOptions := []EndpointOption{
		CreateOptionIpam(n.loadBalancerIP, nil, nil, nil),
		CreateOptionLoadBalancer(),
	}
	ep, err := n.createEndpoint(context.TODO(), endpointName, epOptions...)
	if err != nil {
		return err
	}
	defer func() {
		if retErr != nil {
			if e := ep.Delete(context.WithoutCancel(context.TODO()), true); e != nil {
				log.G(context.TODO()).Warnf("could not delete endpoint %s on failure (%v): %v", endpointName, retErr, e)
			}
		}
	}()
	if err := ep.Join(context.TODO(), sb, nil); err != nil {
		return err
	}
	return sb.EnableService()
}
// deleteLoadBalancerSandbox disables and deletes the network's load-balancer
// endpoint, then destroys its sandbox. Endpoint-level failures are logged
// and ignored so the sandbox teardown is always attempted; only a failure to
// destroy the sandbox itself is returned.
//
// Fix: the returned error string was capitalized ("Failed to ..."), which
// violates Go error-string conventions (staticcheck ST1005).
func (n *Network) deleteLoadBalancerSandbox() error {
	n.mu.Lock()
	c := n.ctrlr
	name := n.name
	n.mu.Unlock()
	sandboxName := n.lbSandboxName()
	endpointName := n.lbEndpointName()
	endpoint, err := n.EndpointByName(endpointName)
	if err != nil {
		log.G(context.TODO()).Warnf("Failed to find load balancer endpoint %s on network %s: %v", endpointName, name, err)
	} else {
		info := endpoint.Info()
		if info != nil {
			sb := info.Sandbox()
			if sb != nil {
				if err := sb.DisableService(); err != nil {
					log.G(context.TODO()).Warnf("Failed to disable service on sandbox %s: %v", sandboxName, err)
					// Ignore error and attempt to delete the load balancer endpoint
				}
			}
		}
		if err := endpoint.Delete(context.TODO(), true); err != nil {
			log.G(context.TODO()).Warnf("Failed to delete endpoint %s (%s) in %s: %v", endpoint.Name(), endpoint.ID(), sandboxName, err)
			// Ignore error and attempt to delete the sandbox.
		}
	}
	if err := c.SandboxDestroy(context.TODO(), sandboxName); err != nil {
		return fmt.Errorf("failed to delete %s sandbox: %v", sandboxName, err)
	}
	return nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package libnetwork
import (
"context"
"github.com/docker/docker/daemon/libnetwork/internal/maputil"
)
// storeNetwork inserts or updates the network in the store and the in-memory
// cache maintained by the Controller.
//
// This method is thread-safe.
func (c *Controller) storeNetwork(ctx context.Context, n *Network) error {
	err := c.updateToStore(ctx, n)
	if err != nil {
		return err
	}
	c.cacheNetwork(n)
	return nil
}
// deleteStoredNetwork deletes the network from the store and the in-memory
// cache maintained by the Controller.
//
// This method is thread-safe.
func (c *Controller) deleteStoredNetwork(n *Network) error {
	if err := c.deleteFromStore(n); err != nil {
		return err
	}
	c.networksMu.Lock()
	delete(c.networks, n.id)
	c.networksMu.Unlock()
	return nil
}
// cacheNetwork caches the network in the in-memory cache of networks
// maintained by the Controller.
//
// This method is thread-safe.
func (c *Controller) cacheNetwork(n *Network) {
	id := n.ID()
	c.networksMu.Lock()
	c.networks[id] = n
	c.networksMu.Unlock()
}
// findNetworks looks for all networks matching the filter from the in-memory
// cache of networks maintained by the Controller.
//
// This method is thread-safe, but do not use it unless you're sure your code
// uses the returned networks in thread-safe way (see the comment on
// Controller.networks).
func (c *Controller) findNetworks(filter func(nw *Network) bool) []*Network {
	c.networksMu.Lock()
	defer c.networksMu.Unlock()
	// The filter runs while networksMu is held; it must not re-enter the
	// Controller in a way that re-acquires networksMu.
	return maputil.FilterValues(c.networks, filter)
}
// filterNetworkByConfigFrom returns a findNetworks filter that matches
// networks whose configFrom field equals expected.
func filterNetworkByConfigFrom(expected string) func(nw *Network) bool {
	return func(nw *Network) bool {
		return nw.configFrom == expected
	}
}
//go:build !windows
package libnetwork
import (
"context"
"fmt"
"strconv"
"time"
"github.com/docker/docker/daemon/libnetwork/ipams/defaultipam"
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/osl"
"github.com/docker/docker/daemon/network"
)
type platformNetwork struct{} //nolint:nolintlint,unused // only populated on windows

// Stub implementations for DNS related functions

// startResolver is a no-op on non-Windows platforms (this file is built with
// `!windows`); see the Windows counterpart for the real implementation.
func (n *Network) startResolver() {
}
// addEpToResolver is a no-op on non-Windows platforms; it always returns nil.
func addEpToResolver(
	ctx context.Context,
	netName, epName string,
	config *containerConfig,
	epIface *EndpointInterface,
	resolvers []*Resolver,
) error {
	return nil
}

// deleteEpFromResolver is a no-op on non-Windows platforms; it always
// returns nil.
func deleteEpFromResolver(epName string, epIface *EndpointInterface, resolvers []*Resolver) error {
	return nil
}
// defaultIpamForNetworkType returns the default IPAM driver name for the
// given network type; on non-Windows platforms this is always the built-in
// default IPAM driver, regardless of networkType.
func defaultIpamForNetworkType(networkType string) string {
	return defaultipam.DriverName
}
// validatedAdvertiseAddrNMsgs parses and range-checks the driver option
// netlabel.AdvertiseAddrNMsgs. It returns (nil, nil) when the option is not
// set, and an error when the value is not an integer or is out of range.
func (n *Network) validatedAdvertiseAddrNMsgs() (*int, error) {
	raw, ok := n.DriverOptions()[netlabel.AdvertiseAddrNMsgs]
	if !ok {
		return nil, nil
	}
	count, err := strconv.Atoi(raw)
	if err != nil {
		return nil, fmt.Errorf("value for option "+netlabel.AdvertiseAddrNMsgs+" %q must be an integer", raw)
	}
	if count < osl.AdvertiseAddrNMsgsMin || count > osl.AdvertiseAddrNMsgsMax {
		return nil, fmt.Errorf(netlabel.AdvertiseAddrNMsgs+" must be in the range %d to %d",
			osl.AdvertiseAddrNMsgsMin, osl.AdvertiseAddrNMsgsMax)
	}
	return &count, nil
}
// validatedAdvertiseAddrInterval parses and range-checks the driver option
// netlabel.AdvertiseAddrIntervalMs (milliseconds). It returns (nil, nil)
// when the option is not set, and an error when the value is not an integer
// or the resulting duration is out of range.
func (n *Network) validatedAdvertiseAddrInterval() (*time.Duration, error) {
	raw, ok := n.DriverOptions()[netlabel.AdvertiseAddrIntervalMs]
	if !ok {
		return nil, nil
	}
	msecs, err := strconv.Atoi(raw)
	if err != nil {
		return nil, fmt.Errorf("value for option "+netlabel.AdvertiseAddrIntervalMs+" %q must be integer milliseconds", raw)
	}
	interval := time.Duration(msecs) * time.Millisecond
	if interval < osl.AdvertiseAddrIntervalMin || interval > osl.AdvertiseAddrIntervalMax {
		return nil, fmt.Errorf(netlabel.AdvertiseAddrIntervalMs+" must be in the range %d to %d",
			osl.AdvertiseAddrIntervalMin/time.Millisecond, osl.AdvertiseAddrIntervalMax/time.Millisecond)
	}
	return &interval, nil
}
// IsPruneable returns true if n can be considered for removal as part of a
// "docker network prune" (or system prune). The caller must still check that the
// network should be removed. For example, it may have active endpoints.
//
// A network is pruneable exactly when it is not one of the daemon's
// predefined networks.
func (n *Network) IsPruneable() bool {
	return !network.IsPredefined(n.Name())
}
package networkdb
import (
"errors"
"time"
"github.com/hashicorp/memberlist"
"github.com/hashicorp/serf/serf"
)
// broadcastTimeout is the maximum time sendNodeEvent waits for a queued node
// event to be broadcast before giving up with an error.
const broadcastTimeout = 5 * time.Second
// networkEventMessage is a memberlist broadcast carrying an encoded
// NetworkEvent for a (network, node) pair.
type networkEventMessage struct {
	id   string // network ID
	node string // originating node ID
	msg  []byte // encoded NetworkEvent payload
}

// Invalidates reports whether m supersedes an older queued broadcast: a
// newer event for the same network and node replaces the previous one.
func (m *networkEventMessage) Invalidates(other memberlist.Broadcast) bool {
	otherm := other.(*networkEventMessage)
	return m.id == otherm.id && m.node == otherm.node
}

// Message returns the raw bytes to gossip.
func (m *networkEventMessage) Message() []byte {
	return m.msg
}

// Finished is a no-op: nothing waits on network event broadcasts.
func (m *networkEventMessage) Finished() {
}
// sendNetworkEvent encodes a NetworkEvent for the given network and queues
// it on the network broadcast queue.
func (nDB *NetworkDB) sendNetworkEvent(nid string, event NetworkEvent_Type, ltime serf.LamportTime) error {
	raw, err := encodeMessage(MessageTypeNetworkEvent, &NetworkEvent{
		Type:      event,
		LTime:     ltime,
		NodeName:  nDB.config.NodeID,
		NetworkID: nid,
	})
	if err != nil {
		return err
	}
	nDB.networkBroadcasts.QueueBroadcast(&networkEventMessage{
		msg:  raw,
		id:   nid,
		node: nDB.config.NodeID,
	})
	return nil
}
// nodeEventMessage is a memberlist broadcast carrying an encoded NodeEvent.
type nodeEventMessage struct {
	msg    []byte        // encoded NodeEvent payload
	notify chan<- struct{} // closed when transmitted; sendNodeEvent waits on it
}

// Invalidates always returns false: node events never supersede each other.
func (m *nodeEventMessage) Invalidates(other memberlist.Broadcast) bool {
	return false
}

// Message returns the raw bytes to gossip.
func (m *nodeEventMessage) Message() []byte {
	return m.msg
}

// Finished signals the waiter, if any, that the broadcast has been sent.
func (m *nodeEventMessage) Finished() {
	if m.notify != nil {
		close(m.notify)
	}
}
// sendNodeEvent encodes and queues a NodeEvent, then waits (up to
// broadcastTimeout) for it to actually be broadcast — unless this node has
// no peers, in which case queuing is enough.
func (nDB *NetworkDB) sendNodeEvent(event NodeEvent_Type) error {
	raw, err := encodeMessage(MessageTypeNodeEvent, &NodeEvent{
		Type:     event,
		LTime:    nDB.networkClock.Increment(),
		NodeName: nDB.config.NodeID,
	})
	if err != nil {
		return err
	}
	done := make(chan struct{})
	nDB.nodeBroadcasts.QueueBroadcast(&nodeEventMessage{
		msg:    raw,
		notify: done,
	})
	nDB.RLock()
	hasPeers := len(nDB.nodes) > 1
	nDB.RUnlock()
	// Message enqueued, do not wait for a send if no peer is present
	if !hasPeers {
		return nil
	}
	// Wait for the broadcast to complete or time out.
	select {
	case <-done:
		return nil
	case <-time.After(broadcastTimeout):
		return errors.New("timed out broadcasting node event")
	}
}
// tableEventMessage is a memberlist broadcast carrying an encoded TableEvent
// for a (network, table, key) triple.
type tableEventMessage struct {
	id    string // network ID
	tname string // table name
	key   string // entry key
	msg   []byte // encoded TableEvent payload
}

// Invalidates reports whether m supersedes an older queued broadcast: a
// newer event for the same network/table/key replaces the previous one.
func (m *tableEventMessage) Invalidates(other memberlist.Broadcast) bool {
	otherm := other.(*tableEventMessage)
	return m.tname == otherm.tname && m.id == otherm.id && m.key == otherm.key
}

// Message returns the raw bytes to gossip.
func (m *tableEventMessage) Message() []byte {
	return m.msg
}

// Finished is a no-op: nothing waits on table event broadcasts.
func (m *tableEventMessage) Finished() {
}
// sendTableEvent encodes a TableEvent for the given entry and queues it on
// the per-network table broadcast queue. The event is silently dropped when
// this node is no longer attached to the network.
func (nDB *NetworkDB) sendTableEvent(event TableEvent_Type, nid string, tname string, key string, entry *entry) error {
	raw, err := encodeMessage(MessageTypeTableEvent, &TableEvent{
		Type:      event,
		LTime:     entry.ltime,
		NodeName:  nDB.config.NodeID,
		NetworkID: nid,
		TableName: tname,
		Key:       key,
		Value:     entry.value,
		// The duration in second is a float that below would be truncated
		ResidualReapTime: int32(entry.reapTime.Seconds()),
	})
	if err != nil {
		return err
	}
	nDB.RLock()
	network, ok := nDB.thisNodeNetworks[nid]
	nDB.RUnlock()
	// The network may have been removed
	if !ok {
		return nil
	}
	network.tableBroadcasts.QueueBroadcast(&tableEventMessage{
		msg:   raw,
		id:    nid,
		tname: tname,
		key:   key,
	})
	return nil
}
// getBroadcasts drains pending broadcasts from the given queues, in order,
// into one batch. limit is a shared byte budget: each retrieved message
// consumes its own length plus overhead, and only the remaining budget is
// offered to the next queue. Once the budget is exhausted, later queues are
// skipped.
func getBroadcasts(overhead, limit int, queues ...*memberlist.TransmitLimitedQueue) [][]byte {
	var msgs [][]byte
	for _, q := range queues {
		b := q.GetBroadcasts(overhead, limit)
		// Account for what this queue consumed before asking the next one.
		for _, m := range b {
			limit -= overhead + len(m)
		}
		msgs = append(msgs, b...)
		if limit <= 0 {
			break
		}
	}
	return msgs
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package networkdb
import (
"bytes"
"context"
"encoding/hex"
"fmt"
golog "log"
rnd "math/rand"
"net"
"net/netip"
"strings"
"time"
"github.com/containerd/log"
"github.com/hashicorp/memberlist"
)
const (
	// reapPeriod is the interval of the reapState background worker
	// (see clusterInit).
	reapPeriod = 5 * time.Second
	// retryInterval paces reconnectNode and retryJoin attempts.
	retryInterval = 1 * time.Second
	// nodeReapInterval — presumably the total time a failed/left node is
	// remembered; it is not referenced in this chunk, confirm against the
	// code that sets node.reapTime.
	nodeReapInterval = 24 * time.Hour
	// nodeReapPeriod is both the interval of the reapDeadNode worker and
	// the amount of reapTime it deducts per run.
	nodeReapPeriod = 2 * time.Hour
	// considering a cluster with > 20 nodes and a drain speed of 100 msg/s
	// the following is roughly 1 minute
	maxQueueLenBroadcastOnSync = 500
)
// logWriter adapts the plain-text log lines emitted by memberlist's stdlib
// logger to the structured logger, mapping memberlist's "[LEVEL] " prefixes
// onto log levels. Note that "[ERR] " lines are deliberately logged at Warn.
type logWriter struct{}

// Write implements io.Writer; it always reports the full input as written.
func (l *logWriter) Write(p []byte) (int, error) {
	msg := strings.TrimSuffix(string(p), "\n")
	logger := log.G(context.TODO())
	switch {
	case strings.HasPrefix(msg, "[WARN] "):
		logger.Warn(strings.TrimPrefix(msg, "[WARN] "))
	case strings.HasPrefix(msg, "[DEBUG] "):
		logger.Debug(strings.TrimPrefix(msg, "[DEBUG] "))
	case strings.HasPrefix(msg, "[INFO] "):
		logger.Info(strings.TrimPrefix(msg, "[INFO] "))
	case strings.HasPrefix(msg, "[ERR] "):
		logger.Warn(strings.TrimPrefix(msg, "[ERR] "))
	}
	return len(p), nil
}
// SetKey adds a new key to the key ring. Adding a key that is already
// present is a no-op.
func (nDB *NetworkDB) SetKey(key []byte) {
	log.G(context.TODO()).Debugf("Adding key %.5s", hex.EncodeToString(key))
	nDB.Lock()
	defer nDB.Unlock()
	for _, existing := range nDB.config.Keys {
		if bytes.Equal(existing, key) {
			return
		}
	}
	nDB.config.Keys = append(nDB.config.Keys, key)
	if nDB.keyring != nil {
		nDB.keyring.AddKey(key)
	}
	logEncKeys(context.TODO(), key)
}
// SetPrimaryKey sets the given key as the primary key. This should have
// been added apriori through SetKey; unknown keys are silently ignored.
func (nDB *NetworkDB) SetPrimaryKey(key []byte) {
	log.G(context.TODO()).Debugf("Primary Key %.5s", hex.EncodeToString(key))
	nDB.Lock()
	defer nDB.Unlock()
	for _, dbKey := range nDB.config.Keys {
		if !bytes.Equal(key, dbKey) {
			continue
		}
		if nDB.keyring != nil {
			nDB.keyring.UseKey(dbKey)
		}
		return
	}
}
// RemoveKey removes a key from the key ring. The key being removed
// can't be the primary key; unknown keys are silently ignored.
func (nDB *NetworkDB) RemoveKey(key []byte) {
	log.G(context.TODO()).Debugf("Remove Key %.5s", hex.EncodeToString(key))
	nDB.Lock()
	defer nDB.Unlock()
	for i := range nDB.config.Keys {
		if !bytes.Equal(key, nDB.config.Keys[i]) {
			continue
		}
		if nDB.keyring != nil {
			nDB.keyring.RemoveKey(nDB.config.Keys[i])
		}
		nDB.config.Keys = append(nDB.config.Keys[:i], nDB.config.Keys[i+1:]...)
		return
	}
}
// clusterInit creates the memberlist instance backing this NetworkDB and
// starts the periodic background workers (state reaping, gossip, bulk table
// sync, node reconnection/reaping, and cluster re-join).
func (nDB *NetworkDB) clusterInit() error {
	nDB.lastStatsTimestamp = time.Now()
	nDB.lastHealthTimestamp = nDB.lastStatsTimestamp
	config := memberlist.DefaultLANConfig()
	config.Name = nDB.config.NodeID
	config.BindAddr = nDB.config.BindAddr
	config.AdvertiseAddr = nDB.config.AdvertiseAddr
	config.UDPBufferSize = nDB.config.PacketBufferSize
	// BindPort == 0 keeps memberlist's default port.
	if nDB.config.BindPort != 0 {
		config.BindPort = nDB.config.BindPort
		config.AdvertisePort = nDB.config.BindPort
	}
	config.ProtocolVersion = memberlist.ProtocolVersion2Compatible
	config.Delegate = &delegate{nDB: nDB}
	config.Events = &eventDelegate{nDB: nDB}
	// custom logger that does not add time or date, so they are not
	// duplicated by logrus
	config.Logger = golog.New(&logWriter{}, "", 0)
	var err error
	if len(nDB.config.Keys) > 0 {
		for i, key := range nDB.config.Keys {
			log.G(context.TODO()).Debugf("Encryption key %d: %.5s", i+1, hex.EncodeToString(key))
		}
		logEncKeys(context.TODO(), nDB.config.Keys...)
		// The first configured key is used as the primary encryption key.
		nDB.keyring, err = memberlist.NewKeyring(nDB.config.Keys, nDB.config.Keys[0])
		if err != nil {
			return err
		}
		config.Keyring = nDB.keyring
	}
	nDB.networkBroadcasts = &memberlist.TransmitLimitedQueue{
		NumNodes:       nDB.estNumNodes,
		RetransmitMult: config.RetransmitMult,
	}
	nDB.nodeBroadcasts = &memberlist.TransmitLimitedQueue{
		NumNodes:       nDB.estNumNodes,
		RetransmitMult: config.RetransmitMult,
	}
	mlist, err := memberlist.Create(config)
	if err != nil {
		return fmt.Errorf("failed to create memberlist: %v", err)
	}
	nDB.ctx, nDB.cancelCtx = context.WithCancel(context.Background())
	nDB.memberlist = mlist
	// Start one staggered periodic worker per trigger; see triggerFunc for
	// the staggering, and clusterLeave for ticker shutdown.
	for _, trigger := range []struct {
		interval time.Duration
		fn       func()
	}{
		{reapPeriod, nDB.reapState},
		{config.GossipInterval, nDB.gossip},
		{config.PushPullInterval, nDB.bulkSyncTables},
		{retryInterval, nDB.reconnectNode},
		{nodeReapPeriod, nDB.reapDeadNode},
		{nDB.config.rejoinClusterInterval, nDB.rejoinClusterBootStrap},
	} {
		t := time.NewTicker(trigger.interval)
		go nDB.triggerFunc(trigger.interval, t.C, trigger.fn)
		nDB.tickers = append(nDB.tickers, t)
	}
	return nil
}
// retryJoin repeatedly attempts to join the memberlist cluster (and announce
// this node) at retryInterval, returning after the first full success or
// when ctx is cancelled.
func (nDB *NetworkDB) retryJoin(ctx context.Context, members []string) {
	ticker := time.NewTicker(retryInterval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
		}
		if _, err := nDB.memberlist.Join(members); err != nil {
			log.G(ctx).Errorf("Failed to join memberlist %s on retry: %v", members, err)
			continue
		}
		if err := nDB.sendNodeEvent(NodeEventTypeJoin); err != nil {
			log.G(ctx).Errorf("failed to send node join on retry: %v", err)
			continue
		}
		return
	}
}
// clusterJoin merges this node into an existing cluster by contacting the
// given members, then announces the join to the cluster.
func (nDB *NetworkDB) clusterJoin(members []string) error {
	// No explicit retry on failure here: rejoinClusterBootStrap runs every
	// nDB.config.rejoinClusterInterval and will keep retrying the join for
	// nDB.config.rejoinClusterDuration.
	if _, err := nDB.memberlist.Join(members); err != nil {
		return fmt.Errorf("could not join node to memberlist: %v", err)
	}
	if err := nDB.sendNodeEvent(NodeEventTypeJoin); err != nil {
		return fmt.Errorf("failed to send node join: %v", err)
	}
	return nil
}
// clusterLeave broadcasts a leave event, departs the memberlist gossip
// cluster, stops the periodic trigger goroutines, and shuts down the
// memberlist transport. Broadcast failures are logged but do not abort the
// shutdown sequence.
func (nDB *NetworkDB) clusterLeave() error {
	if err := nDB.sendNodeEvent(NodeEventTypeLeave); err != nil {
		log.G(context.TODO()).WithError(err).Error("failed to send node leave event")
	}
	if err := nDB.memberlist.Leave(time.Second); err != nil {
		log.G(context.TODO()).WithError(err).Error("failed to broadcast memberlist leave message")
	}

	// Cancelling the context stops the triggerFunc goroutines; stopping the
	// tickers releases their timer resources.
	nDB.cancelCtx()
	for _, ticker := range nDB.tickers {
		ticker.Stop()
	}

	return nDB.memberlist.Shutdown()
}
// triggerFunc invokes f each time C fires, after an initial random delay in
// [0, stagger) that desynchronizes this node's periodic work from its peers'.
// It returns when the NetworkDB context is cancelled.
func (nDB *NetworkDB) triggerFunc(stagger time.Duration, C <-chan time.Time, f func()) {
	// Random initial delay so the nodes do not all fire in lockstep.
	delay := time.Duration(uint64(rnd.Int63()) % uint64(stagger)) //nolint:gosec // gosec complains about the use of rand here. It should be fine.
	select {
	case <-nDB.ctx.Done():
		return
	case <-time.After(delay):
	}
	for {
		select {
		case <-nDB.ctx.Done():
			return
		case <-C:
			f()
		}
	}
}
// reapDeadNode ages out entries in the failed and left node lists, removing a
// node once its reapTime budget is consumed.
func (nDB *NetworkDB) reapDeadNode() {
	nDB.Lock()
	defer nDB.Unlock()
	for _, nodes := range []map[string]*node{
		nDB.failedNodes,
		nDB.leftNodes,
	} {
		for id, n := range nodes {
			if n.reapTime <= nodeReapPeriod {
				// Budget exhausted: forget the node entirely.
				log.G(context.TODO()).Debugf("Garbage collect node %v", n.Name)
				delete(nodes, id)
				continue
			}
			n.reapTime -= nodeReapPeriod
		}
	}
}
// rejoinClusterBootStrap is called periodically to check if all bootStrap nodes are active in the cluster,
// if not, call the cluster join to merge 2 separate clusters that are formed when all managers
// stopped/started at the same time
func (nDB *NetworkDB) rejoinClusterBootStrap() {
	nDB.RLock()
	if len(nDB.bootStrapIP) == 0 {
		// No bootstrap addresses configured; nothing to rejoin.
		nDB.RUnlock()
		return
	}

	myself, ok := nDB.nodes[nDB.config.NodeID]
	if !ok {
		nDB.RUnlock()
		log.G(context.TODO()).Warnf("rejoinClusterBootstrap unable to find local node info using ID:%v", nDB.config.NodeID)
		return
	}
	bootStrapIPs := make([]string, 0, len(nDB.bootStrapIP))
	for _, bootIP := range nDB.bootStrapIP {
		// bootstrap IPs are usually IP:port from the Join
		bootstrapIP, err := netip.ParseAddrPort(bootIP)
		if err != nil {
			// try to parse it as an IP without port
			// Note this seems to be the case for swarm that do not specify any port
			addr, err := netip.ParseAddr(bootIP)
			if err == nil {
				// Fall back to the configured bind port.
				bootstrapIP = netip.AddrPortFrom(addr, uint16(nDB.config.BindPort))
			}
		}
		// bootstrapIP remains the zero (invalid) AddrPort if both parses failed.
		if bootstrapIP.IsValid() {
			for _, node := range nDB.nodes {
				if node == myself {
					// Not interested in finding ourselves.
					continue
				}
				nodeIP, _ := netip.AddrFromSlice(node.Addr)
				if bootstrapIP == netip.AddrPortFrom(nodeIP, node.Port) {
					// One of the bootstrap nodes (and not myself) is part of the cluster, return
					nDB.RUnlock()
					return
				}
			}
			bootStrapIPs = append(bootStrapIPs, bootIP)
		}
	}
	nDB.RUnlock()
	if len(bootStrapIPs) == 0 {
		// this will also avoid to call the Join with an empty list erasing the current bootstrap ip list
		log.G(context.TODO()).Debug("rejoinClusterBootStrap did not find any valid IP")
		return
	}
	// None of the bootStrap nodes are in the cluster, call memberlist join
	log.G(context.TODO()).Debugf("rejoinClusterBootStrap, calling cluster join with bootStrap %v", bootStrapIPs)
	ctx, cancel := context.WithTimeout(nDB.ctx, nDB.config.rejoinClusterDuration)
	defer cancel()
	// retryJoin keeps retrying until success or until the timeout above expires.
	nDB.retryJoin(ctx, bootStrapIPs)
}
// reconnectNode picks one random node from the failed-node list and attempts
// to rejoin it, announcing our presence and bulk syncing with it on success.
// Failures are silent; the next periodic run will try again.
func (nDB *NetworkDB) reconnectNode() {
	nDB.RLock()
	if len(nDB.failedNodes) == 0 {
		nDB.RUnlock()
		return
	}
	candidates := make([]*node, 0, len(nDB.failedNodes))
	for _, n := range nDB.failedNodes {
		candidates = append(candidates, n)
	}
	nDB.RUnlock()

	nDB.rngMu.Lock()
	idx := nDB.rng.IntN(len(candidates))
	nDB.rngMu.Unlock()
	target := candidates[idx]

	addr := net.UDPAddr{IP: target.Addr, Port: int(target.Port)}
	if _, err := nDB.memberlist.Join([]string{addr.String()}); err != nil {
		return
	}
	if err := nDB.sendNodeEvent(NodeEventTypeJoin); err != nil {
		return
	}

	log.G(context.TODO()).Debugf("Initiating bulk sync with node %s after reconnect", target.Name)
	nDB.bulkSync([]string{target.Name}, true)
}
// For timing the entry deletion in the reaper APIs that doesn't use monotonic clock
// source (time.Now, Sub etc.) should be avoided. Hence we use reapTime in every
// entry which is set initially to reapInterval and decremented by reapPeriod every time
// the reaper runs. NOTE nDB.reapTableEntries updates the reapTime with a readlock. This
// is safe as long as no other concurrent path touches the reapTime field.

// reapState runs one garbage-collection pass. Table entries are reaped before
// networks because entry reaping relies on the network still being present.
func (nDB *NetworkDB) reapState() {
	nDB.reapTableEntries()
	nDB.reapNetworks()
}
// reapNetworks deletes leaving network attachments whose reap timer has
// expired, both for this node and for every remote node.
func (nDB *NetworkDB) reapNetworks() {
	nDB.Lock()
	defer nDB.Unlock()

	reap := func(networks map[string]*network) {
		for id, n := range networks {
			if !n.leaving {
				continue
			}
			if n.reapTime <= 0 {
				delete(networks, id)
				continue
			}
			n.reapTime -= reapPeriod
		}
	}

	reap(nDB.thisNodeNetworks)
	for _, nn := range nDB.networks {
		reap(nn)
	}
}
// reapTableEntries garbage collects table entries that were marked for
// deletion once their residual reapTime is exhausted. The write lock is taken
// per network rather than for the whole sweep so the database is not blocked
// for too long.
func (nDB *NetworkDB) reapTableEntries() {
	var nodeNetworks []string
	// This is best effort, if the list of network changes will be picked up in the next cycle
	nDB.RLock()
	for nid := range nDB.thisNodeNetworks {
		nodeNetworks = append(nodeNetworks, nid)
	}
	nDB.RUnlock()

	cycleStart := time.Now()
	// In order to avoid blocking the database for a long time, apply the garbage collection logic by network
	// The lock is taken at the beginning of the cycle and the deletion is inline
	for _, nid := range nodeNetworks {
		nDB.Lock()
		nDB.indexes[byNetwork].Root().WalkPrefix([]byte("/"+nid), func(path []byte, v *entry) bool {
			// timeCompensation compensate in case the lock took some time to be released
			timeCompensation := time.Since(cycleStart)
			if !v.deleting {
				// Live entry; nothing to reap.
				return false
			}

			// In this check we are adding an extra 1 second to guarantee that when the number is truncated to int32 to fit the packet
			// for the tableEvent the number is always strictly > 1 and never 0
			if v.reapTime > reapPeriod+timeCompensation+time.Second {
				v.reapTime -= reapPeriod + timeCompensation
				return false
			}

			// Residual time exhausted: remove the entry from both indexes.
			// Keys in the byNetwork tree have the form /network/table/key
			// (leading '/' stripped before splitting).
			params := strings.Split(string(path[1:]), "/")
			nwID, tName, key := params[0], params[1], params[2]
			okTable, okNetwork := nDB.deleteEntry(nwID, tName, key)
			if !okTable {
				log.G(context.TODO()).Errorf("Table tree delete failed, entry with key:%s does not exist in the table:%s network:%s", key, tName, nwID)
			}
			if !okNetwork {
				log.G(context.TODO()).Errorf("Network tree delete failed, entry with key:%s does not exist in the network:%s table:%s", key, nwID, tName)
			}

			// Returning false continues the walk over the remaining entries.
			return false
		})
		nDB.Unlock()
	}
}
// gossip sends pending table broadcasts for every network this node is
// attached to, to up to 3 random peers per network. It also emits the
// periodic health and per-network stats log lines when their respective
// print periods have elapsed.
func (nDB *NetworkDB) gossip() {
	networkNodes := make(map[string][]string)
	nDB.RLock()
	// Snapshot the peer list of every network we participate in.
	for nid := range nDB.thisNodeNetworks {
		networkNodes[nid] = nDB.networkNodes[nid]
	}
	printStats := time.Since(nDB.lastStatsTimestamp) >= nDB.config.StatsPrintPeriod
	printHealth := time.Since(nDB.lastHealthTimestamp) >= nDB.config.HealthPrintPeriod
	nDB.RUnlock()

	if printHealth {
		// A non-zero memberlist health score indicates connectivity trouble.
		healthScore := nDB.memberlist.GetHealthScore()
		if healthScore != 0 {
			log.G(context.TODO()).Warnf("NetworkDB stats %v(%v) - healthscore:%d (connectivity issues)", nDB.config.Hostname, nDB.config.NodeID, healthScore)
		}
		nDB.lastHealthTimestamp = time.Now()
	}

	for nid, nodes := range networkNodes {
		mNodes := nDB.mRandomNodes(3, nodes)
		// Budget for one UDP packet, minus the compound message header.
		bytesAvail := nDB.config.PacketBufferSize - compoundHeaderOverhead

		nDB.RLock()
		network, ok := nDB.thisNodeNetworks[nid]
		nDB.RUnlock()
		if !ok || network == nil {
			// It is normal for the network to be removed
			// between the time we collect the network
			// attachments of this node and processing
			// them here.
			continue
		}

		// Drain as many queued broadcasts and rebroadcasts as fit in the budget.
		msgs := getBroadcasts(compoundOverhead, bytesAvail, network.tableBroadcasts, network.tableRebroadcasts)
		// Collect stats and print the queue info, note this code is here also to have a view of the queues empty
		network.qMessagesSent.Add(int64(len(msgs)))
		if printStats {
			msent := network.qMessagesSent.Swap(0)
			log.G(context.TODO()).Infof("NetworkDB stats %v(%v) - netID:%s leaving:%t netPeers:%d entries:%d Queue qLen:%d+%d netMsg/s:%d",
				nDB.config.Hostname, nDB.config.NodeID,
				nid, network.leaving, network.tableBroadcasts.NumNodes(), network.entriesNumber.Load(),
				network.tableBroadcasts.NumQueued(), network.tableRebroadcasts.NumQueued(),
				msent/int64((nDB.config.StatsPrintPeriod/time.Second)))
		}

		if len(msgs) == 0 {
			continue
		}

		// Create a compound message
		compound := makeCompoundMessage(msgs)

		for _, node := range mNodes {
			nDB.RLock()
			mnode := nDB.nodes[node]
			nDB.RUnlock()

			if mnode == nil {
				break
			}

			// Send the compound message
			if err := nDB.memberlist.SendBestEffort(&mnode.Node, compound); err != nil {
				log.G(context.TODO()).Errorf("Failed to send gossip to %s: %s", mnode.Addr, err)
			}
		}
	}
	// Reset the stats
	if printStats {
		nDB.lastStatsTimestamp = time.Now()
	}
}
// bulkSyncTables runs a periodic bulk sync for every network this node
// participates in (excluding networks being left), skipping any network that
// was already covered by an earlier sync in the same pass.
func (nDB *NetworkDB) bulkSyncTables() {
	var pending []string
	nDB.RLock()
	for nid, network := range nDB.thisNodeNetworks {
		if !network.leaving {
			pending = append(pending, nid)
		}
	}
	nDB.RUnlock()

	for len(pending) > 0 {
		nid := pending[0]
		pending = pending[1:]

		nDB.RLock()
		nodes := nDB.networkNodes[nid]
		nDB.RUnlock()

		if len(nodes) == 0 {
			// No peer nodes on this network. Move on.
			continue
		}

		completed, err := nDB.bulkSync(nodes, false)
		if err != nil {
			log.G(context.TODO()).Errorf("periodic bulk sync failure for network %s: %v", nid, err)
			continue
		}

		// A single sync can cover several networks at once; drop the ones
		// that were completed so they are not synced again in this pass.
		done := make(map[string]struct{}, len(completed))
		for _, cnid := range completed {
			done[cnid] = struct{}{}
		}
		remaining := pending[:0]
		for _, p := range pending {
			if _, ok := done[p]; !ok {
				remaining = append(remaining, p)
			}
		}
		pending = remaining
	}
}
// bulkSync performs a bulk sync with the given candidate nodes. When all is
// false only up to two random candidates are tried and the sync stops after
// the first success (the second candidate is a fallback). It returns the
// networks that were synced on success, or the last failure otherwise.
func (nDB *NetworkDB) bulkSync(nodes []string, all bool) ([]string, error) {
	if !all {
		// Pick two random candidates; the second is only used if the
		// sync with the first one fails.
		nodes = nDB.mRandomNodes(2, nodes)
	}
	if len(nodes) == 0 {
		return nil, nil
	}

	var (
		lastErr  error
		networks []string
		synced   bool
	)
	for _, node := range nodes {
		if node == nDB.config.NodeID {
			// Never sync with ourselves.
			continue
		}
		log.G(context.TODO()).Debugf("%v(%v): Initiating bulk sync with node %v", nDB.config.Hostname, nDB.config.NodeID, node)
		networks = nDB.findCommonNetworks(node)
		if serr := nDB.bulkSyncNode(networks, node, true); serr != nil {
			lastErr = fmt.Errorf("bulk sync to node %s failed: %v", node, serr)
			log.G(context.TODO()).Warn(lastErr.Error())
			continue
		}
		synced = true
		// For the periodic bulk sync one successful peer is enough.
		if !all {
			break
		}
	}

	if synced {
		// At least one node sync succeeded.
		return networks, nil
	}
	return nil, lastErr
}
// Bulk sync all the table entries belonging to a set of networks to a
// single peer node. It can be unsolicited or can be in response to an
// unsolicited bulk sync. When unsolicited, the call blocks up to 30 seconds
// waiting for the peer's answering bulk sync to release the per-node ack
// channel registered in bulkSyncAckTbl.
func (nDB *NetworkDB) bulkSyncNode(networks []string, node string, unsolicited bool) error {
	var msgs [][]byte

	var unsolMsg string
	if unsolicited {
		unsolMsg = "unsolicited"
	}

	log.G(context.TODO()).Debugf("%v(%v): Initiating %s bulk sync for networks %v with node %s",
		nDB.config.Hostname, nDB.config.NodeID, unsolMsg, networks, node)

	nDB.RLock()
	mnode := nDB.nodes[node]
	if mnode == nil {
		// Peer unknown to memberlist; nothing to send.
		nDB.RUnlock()
		return nil
	}

	// Serialize every entry of every requested network as a TableEvent.
	for _, nid := range networks {
		nDB.indexes[byNetwork].Root().WalkPrefix([]byte("/"+nid), func(path []byte, v *entry) bool {
			eType := TableEventTypeCreate
			if v.deleting {
				eType = TableEventTypeDelete
			}

			// Keys in the byNetwork tree have the form /network/table/key.
			params := strings.Split(string(path[1:]), "/")
			tEvent := TableEvent{
				Type:      eType,
				LTime:     v.ltime,
				NodeName:  v.node,
				NetworkID: nid,
				TableName: params[1],
				Key:       params[2],
				Value:     v.value,
				// The duration in second is a float that below would be truncated
				ResidualReapTime: int32(v.reapTime.Seconds()),
			}

			msg, err := encodeMessage(MessageTypeTableEvent, &tEvent)
			if err != nil {
				log.G(context.TODO()).Errorf("Encode failure during bulk sync: %#v", tEvent)
				return false
			}

			msgs = append(msgs, msg)
			return false
		})
	}
	nDB.RUnlock()

	// Create a compound message
	compound := makeCompoundMessage(msgs)

	bsm := BulkSyncMessage{
		LTime:       nDB.tableClock.Time(),
		Unsolicited: unsolicited,
		NodeName:    nDB.config.NodeID,
		Networks:    networks,
		Payload:     compound,
	}

	buf, err := encodeMessage(MessageTypeBulkSync, &bsm)
	if err != nil {
		return fmt.Errorf("failed to encode bulk sync message: %v", err)
	}

	// Register the ack channel before sending so the peer's response cannot
	// race past us.
	nDB.Lock()
	ch := make(chan struct{})
	nDB.bulkSyncAckTbl[node] = ch
	nDB.Unlock()

	err = nDB.memberlist.SendReliable(&mnode.Node, buf)
	if err != nil {
		// Send failed: no response will ever come, so unregister the ack.
		nDB.Lock()
		delete(nDB.bulkSyncAckTbl, node)
		nDB.Unlock()

		return fmt.Errorf("failed to send a TCP message during bulk sync: %v", err)
	}

	// Wait on a response only if it is unsolicited.
	if unsolicited {
		startTime := time.Now()
		t := time.NewTimer(30 * time.Second)
		select {
		case <-t.C:
			log.G(context.TODO()).Errorf("Bulk sync to node %s timed out", node)
		case <-ch:
			log.G(context.TODO()).Debugf("%v(%v): Bulk sync to node %s took %s", nDB.config.Hostname, nDB.config.NodeID, node, time.Since(startTime))
		}
		t.Stop()
	}

	return nil
}
// mRandomNodes selects up to m random nodes from the given list, always
// excluding this node itself. Fewer than m nodes may be returned when there
// are not enough candidates.
func (nDB *NetworkDB) mRandomNodes(m int, nodes []string) []string {
	candidates := make([]string, 0, max(0, len(nodes)-1))
	for _, n := range nodes {
		if n != nDB.config.NodeID {
			candidates = append(candidates, n)
		}
	}

	if len(candidates) < m {
		// Not enough candidates: return all of them, in random order.
		nDB.rngMu.Lock()
		nDB.rng.Shuffle(len(candidates), func(i, j int) {
			candidates[i], candidates[j] = candidates[j], candidates[i]
		})
		nDB.rngMu.Unlock()
		return candidates
	}

	// Pick m distinct candidates via a random permutation.
	nDB.rngMu.Lock()
	perm := nDB.rng.Perm(len(candidates))
	nDB.rngMu.Unlock()

	picked := make([]string, 0, m)
	for _, idx := range perm[:m] {
		picked = append(picked, candidates[idx])
	}
	return picked
}
package networkdb
import (
"context"
"encoding/hex"
"fmt"
"os"
"strings"
"github.com/containerd/log"
)
// logEncKeys appends the hex encoding of each key, one per line, to the file
// named by the NETWORKDBKEYLOGFILE environment variable. It is a no-op when
// the variable is unset; write failures are logged rather than returned.
func logEncKeys(ctx context.Context, keys ...[]byte) {
	klpath := os.Getenv("NETWORKDBKEYLOGFILE")
	if klpath == "" {
		return
	}
	logFailure := func(err error) {
		log.G(ctx).WithFields(log.Fields{
			"error": err,
			"path":  klpath,
		}).Error("could not write to NetworkDB encryption-key log")
	}

	f, err := os.OpenFile(klpath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o600)
	if err != nil {
		logFailure(err)
		return
	}
	defer func() {
		if cerr := f.Close(); cerr != nil {
			logFailure(cerr)
		}
	}()

	enc := hex.NewEncoder(f)
	for _, key := range keys {
		if _, err := enc.Write(key); err != nil {
			logFailure(err)
			return
		}
		if _, err := f.WriteString("\n"); err != nil {
			logFailure(err)
			return
		}
	}
}
// DebugDumpTable renders every entry of the named table, one per line, for
// debugging purposes.
func (nDB *NetworkDB) DebugDumpTable(tname string) string {
	nDB.RLock()
	root := nDB.indexes[byTable].Root()
	nDB.RUnlock()

	var out strings.Builder
	root.WalkPrefix([]byte("/"+tname), func(path []byte, v *entry) bool {
		fmt.Fprintf(&out, " %q: %+v\n", path, v)
		// Keep walking the remaining entries.
		return false
	})
	return out.String()
}
package networkdb
import (
"context"
"net"
"time"
"github.com/containerd/log"
"github.com/gogo/protobuf/proto"
)
// delegate routes memberlist.Delegate callbacks (node metadata, incoming
// messages, broadcasts, and push-pull state exchange) to the owning
// NetworkDB instance.
type delegate struct {
	nDB *NetworkDB
}
// NodeMeta implements memberlist.Delegate. NetworkDB attaches no metadata to
// its nodes, so an empty (non-nil) slice is always returned; limit is unused.
func (d *delegate) NodeMeta(limit int) []byte {
	return []byte{}
}
// handleNodeEvent applies a remote node join/leave event to the local node
// state machine. It returns true when the event changed local state and
// should therefore be rebroadcast to the rest of the cluster.
func (nDB *NetworkDB) handleNodeEvent(nEvent *NodeEvent) bool {
	// Update our local clock if the received messages has newer
	// time.
	nDB.networkClock.Witness(nEvent.LTime)

	nDB.Lock()
	defer nDB.Unlock()

	// check if the node exists
	n, _, _ := nDB.findNode(nEvent.NodeName)
	if n == nil {
		return false
	}

	// check if the event is fresh
	if n.ltime >= nEvent.LTime {
		return false
	}

	// If we are here means that the event is fresher and the node is known. Update the lamport time
	n.ltime = nEvent.LTime

	// If the node is not known from memberlist we cannot process save any state of it else if it actually
	// dies we won't receive any notification and we will remain stuck with it
	if _, ok := nDB.nodes[nEvent.NodeName]; !ok {
		log.G(context.TODO()).Errorf("node: %s is unknown to memberlist", nEvent.NodeName)
		return false
	}

	switch nEvent.Type {
	case NodeEventTypeJoin:
		moved, err := nDB.changeNodeState(n.Name, nodeActiveState)
		if err != nil {
			log.G(context.TODO()).WithError(err).Error("unable to find the node to move")
			return false
		}
		// Only rebroadcast if the node actually changed state lists.
		if moved {
			log.G(context.TODO()).Infof("%v(%v): Node join event for %s/%s", nDB.config.Hostname, nDB.config.NodeID, n.Name, n.Addr)
		}
		return moved
	case NodeEventTypeLeave:
		moved, err := nDB.changeNodeState(n.Name, nodeLeftState)
		if err != nil {
			log.G(context.TODO()).WithError(err).Error("unable to find the node to move")
			return false
		}
		// Only rebroadcast if the node actually changed state lists.
		if moved {
			log.G(context.TODO()).Infof("%v(%v): Node leave event for %s/%s", nDB.config.Hostname, nDB.config.NodeID, n.Name, n.Addr)
		}
		return moved
	default:
		// TODO(thaJeztah): make switch exhaustive; add networkdb.NodeEventTypeInvalid
		return false
	}
}
// handleNetworkEvent applies a remote node's network join/leave event to the
// local per-node network attachment state. It returns true when the event
// changed local state and should be rebroadcast to the rest of the cluster.
func (nDB *NetworkDB) handleNetworkEvent(nEvent *NetworkEvent) bool {
	// Update our local clock if the received messages has newer
	// time.
	nDB.networkClock.Witness(nEvent.LTime)

	nDB.Lock()
	defer nDB.Unlock()

	// Our own attachments are tracked in thisNodeNetworks, not here.
	if nEvent.NodeName == nDB.config.NodeID {
		return false
	}

	nodeNetworks, ok := nDB.networks[nEvent.NodeName]
	if !ok {
		// We haven't heard about this node at all.  Ignore the leave
		if nEvent.Type == NetworkEventTypeLeave {
			return false
		}

		nodeNetworks = make(map[string]*network)
		nDB.networks[nEvent.NodeName] = nodeNetworks
	}

	if n, ok := nodeNetworks[nEvent.NetworkID]; ok {
		// We have the latest state. Ignore the event
		// since it is stale.
		if n.ltime >= nEvent.LTime {
			return false
		}

		n.ltime = nEvent.LTime
		n.leaving = nEvent.Type == NetworkEventTypeLeave
		if n.leaving {
			// Start the reap countdown for this attachment.
			n.reapTime = nDB.config.reapNetworkInterval

			// The remote node is leaving the network, but not the gossip cluster.
			// Delete all the entries for this network owned by the node.
			nDB.deleteNodeNetworkEntries(nEvent.NetworkID, nEvent.NodeName)
		}

		if nEvent.Type == NetworkEventTypeLeave {
			nDB.deleteNetworkNode(nEvent.NetworkID, nEvent.NodeName)
		} else {
			nDB.addNetworkNode(nEvent.NetworkID, nEvent.NodeName)
		}

		return true
	}

	if nEvent.Type == NetworkEventTypeLeave {
		return false
	}

	// If the node is not known from memberlist we cannot process save any state of it else if it actually
	// dies we won't receive any notification and we will remain stuck with it
	if _, ok := nDB.nodes[nEvent.NodeName]; !ok {
		return false
	}

	// This remote network join is being seen the first time.
	nodeNetworks[nEvent.NetworkID] = &network{ltime: nEvent.LTime}
	nDB.addNetworkNode(nEvent.NetworkID, nEvent.NodeName)
	return true
}
// handleTableEvent applies a remote table CREATE/UPDATE/DELETE event to the
// local entry store and notifies the local watchers. It returns true when the
// event should be rebroadcast to the rest of the cluster.
func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent, isBulkSync bool) bool {
	// Update our local clock if the received messages has newer time.
	nDB.tableClock.Witness(tEvent.LTime)

	nDB.Lock()
	// Hold the lock until after we broadcast the event to watchers so that
	// the new watch receives either the synthesized event or the event we
	// broadcast, never both.
	defer nDB.Unlock()

	// Ignore the table events for networks that are in the process of going away
	network, ok := nDB.thisNodeNetworks[tEvent.NetworkID]
	// Check if the owner of the event is still part of the network
	nodes := nDB.networkNodes[tEvent.NetworkID]
	var nodePresent bool
	for _, node := range nodes {
		if node == tEvent.NodeName {
			nodePresent = true
			break
		}
	}
	if !ok || network.leaving || !nodePresent {
		// I'm out of the network OR the event owner is not anymore part of the network so do not propagate
		return false
	}

	var entryPresent bool
	prev, err := nDB.getEntry(tEvent.TableName, tEvent.NetworkID, tEvent.Key)
	if err == nil {
		entryPresent = true
		// We have the latest state. Ignore the event
		// since it is stale.
		if prev.ltime >= tEvent.LTime {
			return false
		}
	}

	// Build the new local entry from the event.
	e := &entry{
		ltime:    tEvent.LTime,
		node:     tEvent.NodeName,
		value:    tEvent.Value,
		deleting: tEvent.Type == TableEventTypeDelete,
		reapTime: time.Duration(tEvent.ResidualReapTime) * time.Second,
	}

	// All the entries marked for deletion should have a reapTime set greater than 0
	// This case can happen if the cluster is running different versions of the engine where the old version does not have the
	// field. If that is not the case, this can be a BUG
	if e.deleting && e.reapTime == 0 {
		log.G(context.TODO()).Warnf("%v(%v) handleTableEvent object %+v has a 0 reapTime, is the cluster running the same docker engine version?",
			nDB.config.Hostname, nDB.config.NodeID, tEvent)
		e.reapTime = nDB.config.reapEntryInterval
	}

	nDB.createOrUpdateEntry(tEvent.NetworkID, tEvent.TableName, tEvent.Key, e)

	if !entryPresent && tEvent.Type == TableEventTypeDelete {
		// We will rebroadcast the message for an unknown entry if all the conditions are met:
		// 1) the message was received from a bulk sync
		// 2) we had already synced this network (during the network join)
		// 3) the residual reapTime is higher than 1/6 of the total reapTime.
		//
		// If the residual reapTime is lower or equal to 1/6 of the total reapTime
		// don't bother broadcasting it around as most likely the cluster is already aware of it.
		// This also reduces the possibility that deletion of entries close to their garbage collection
		// ends up circling around forever.
		//
		// The safest approach is to not rebroadcast async messages for unknown entries.
		// It is possible that the queue grew so much to exceed the garbage collection time
		// (the residual reap time that is in the message is not being updated, to avoid
		// inserting too many messages in the queue).
		// log.G(ctx).Infof("exiting on delete not knowing the obj with rebroadcast:%t", network.inSync)
		return isBulkSync && network.inSync && e.reapTime > nDB.config.reapEntryInterval/6
	}

	var op opType
	value := tEvent.Value
	switch tEvent.Type {
	case TableEventTypeCreate, TableEventTypeUpdate:
		// Gossip messages could arrive out-of-order so it is possible
		// for an entry's UPDATE event to be received before its CREATE
		// event. The local watchers should not need to care about such
		// nuances. Broadcast events to watchers based only on what
		// changed in the local NetworkDB state.
		op = opCreate
		if entryPresent && !prev.deleting {
			op = opUpdate
		}
	case TableEventTypeDelete:
		if !entryPresent || prev.deleting {
			// Watchers never observed this entry alive; nothing to notify.
			goto SkipBroadcast
		}
		op = opDelete
		// Broadcast the value most recently observed by watchers,
		// which may be different from the value in the DELETE event
		// (e.g. if the DELETE event was received out-of-order).
		value = prev.value
	default:
		// TODO(thaJeztah): make switch exhaustive; add networkdb.TableEventTypeInvalid
	}
	nDB.broadcaster.Write(makeEvent(op, tEvent.TableName, tEvent.NetworkID, tEvent.Key, value))
SkipBroadcast:
	return network.inSync
}
// handleCompound unpacks a compound message and dispatches every embedded
// message individually, preserving the bulk-sync flag.
func (nDB *NetworkDB) handleCompound(buf []byte, isBulkSync bool) {
	parts, err := decodeCompoundMessage(buf)
	if err != nil {
		log.G(context.TODO()).Errorf("Failed to decode compound request: %v", err)
		return
	}
	for _, m := range parts {
		nDB.handleMessage(m, isBulkSync)
	}
}
// handleTableMessage decodes a table event received via gossip or bulk sync,
// applies it locally, and re-queues it for further gossip when the event
// handler requests a rebroadcast.
func (nDB *NetworkDB) handleTableMessage(buf []byte, isBulkSync bool) {
	var tEvent TableEvent
	if err := proto.Unmarshal(buf, &tEvent); err != nil {
		log.G(context.TODO()).Errorf("Error decoding table event message: %v", err)
		return
	}

	// Ignore messages that this node generated.
	if tEvent.NodeName == nDB.config.NodeID {
		return
	}

	if !nDB.handleTableEvent(&tEvent, isBulkSync) {
		return
	}

	msg, err := encodeRawMessage(MessageTypeTableEvent, buf)
	if err != nil {
		log.G(context.TODO()).Errorf("Error marshalling gossip message for network event rebroadcast: %v", err)
		return
	}

	nDB.RLock()
	n, ok := nDB.thisNodeNetworks[tEvent.NetworkID]
	nDB.RUnlock()

	// Drop the rebroadcast if the network is not there anymore, or we are
	// leaving the network.
	if !ok || n.leaving {
		return
	}

	// If the queue is over the threshold, avoid distributing information
	// coming from TCP sync.
	if isBulkSync && n.tableRebroadcasts.NumQueued() > maxQueueLenBroadcastOnSync {
		return
	}

	n.tableRebroadcasts.QueueBroadcast(&tableEventMessage{
		msg:   msg,
		id:    tEvent.NetworkID,
		tname: tEvent.TableName,
		key:   tEvent.Key,
	})
}
// handleNodeMessage decodes a node join/leave event from gossip, applies it
// locally, and re-queues it for further gossip when the event handler
// requests a rebroadcast.
func (nDB *NetworkDB) handleNodeMessage(buf []byte) {
	var nEvent NodeEvent
	if err := proto.Unmarshal(buf, &nEvent); err != nil {
		log.G(context.TODO()).Errorf("Error decoding node event message: %v", err)
		return
	}

	if !nDB.handleNodeEvent(&nEvent) {
		return
	}

	msg, err := encodeRawMessage(MessageTypeNodeEvent, buf)
	if err != nil {
		log.G(context.TODO()).Errorf("Error marshalling gossip message for node event rebroadcast: %v", err)
		return
	}

	nDB.nodeBroadcasts.QueueBroadcast(&nodeEventMessage{
		msg: msg,
	})
}
// handleNetworkMessage decodes a network attach/detach event from gossip,
// applies it locally, and re-queues it for further gossip when the event
// handler requests a rebroadcast.
func (nDB *NetworkDB) handleNetworkMessage(buf []byte) {
	var nEvent NetworkEvent
	if err := proto.Unmarshal(buf, &nEvent); err != nil {
		log.G(context.TODO()).Errorf("Error decoding network event message: %v", err)
		return
	}

	if !nDB.handleNetworkEvent(&nEvent) {
		return
	}

	msg, err := encodeRawMessage(MessageTypeNetworkEvent, buf)
	if err != nil {
		log.G(context.TODO()).Errorf("Error marshalling gossip message for network event rebroadcast: %v", err)
		return
	}

	nDB.networkBroadcasts.QueueBroadcast(&networkEventMessage{
		msg:  msg,
		id:   nEvent.NetworkID,
		node: nEvent.NodeName,
	})
}
// handleBulkSync processes an incoming bulk sync payload. A solicited sync
// (the answer to one we initiated) releases the pending ack for the
// originating node; an unsolicited one is answered with our own state for the
// same networks.
func (nDB *NetworkDB) handleBulkSync(buf []byte) {
	var bsm BulkSyncMessage
	if err := proto.Unmarshal(buf, &bsm); err != nil {
		log.G(context.TODO()).Errorf("Error decoding bulk sync message: %v", err)
		return
	}

	if bsm.LTime > 0 {
		nDB.tableClock.Witness(bsm.LTime)
	}

	nDB.handleMessage(bsm.Payload, true)

	// Don't respond to a bulk sync which was not unsolicited
	if !bsm.Unsolicited {
		nDB.Lock()
		if ch, ok := nDB.bulkSyncAckTbl[bsm.NodeName]; ok {
			// Wake up the bulkSyncNode call waiting on this ack.
			close(ch)
			delete(nDB.bulkSyncAckTbl, bsm.NodeName)
		}
		nDB.Unlock()
		return
	}

	// Resolve the sender's address purely for logging purposes.
	var nodeAddr net.IP
	nDB.RLock()
	if n, ok := nDB.nodes[bsm.NodeName]; ok {
		nodeAddr = n.Addr
	}
	nDB.RUnlock()

	if err := nDB.bulkSyncNode(bsm.Networks, bsm.NodeName, false); err != nil {
		log.G(context.TODO()).Errorf("Error in responding to bulk sync from node %s: %v", nodeAddr, err)
	}
}
// handleMessage unwraps a gossip envelope and dispatches its payload to the
// handler for the embedded message type.
func (nDB *NetworkDB) handleMessage(buf []byte, isBulkSync bool) {
	mType, payload, err := decodeMessage(buf)
	if err != nil {
		log.G(context.TODO()).Errorf("Error decoding gossip message to get message type: %v", err)
		return
	}

	switch mType {
	case MessageTypeNodeEvent:
		nDB.handleNodeMessage(payload)
	case MessageTypeNetworkEvent:
		nDB.handleNetworkMessage(payload)
	case MessageTypeTableEvent:
		nDB.handleTableMessage(payload, isBulkSync)
	case MessageTypeBulkSync:
		nDB.handleBulkSync(payload)
	case MessageTypeCompound:
		nDB.handleCompound(payload, isBulkSync)
	default:
		log.G(context.TODO()).Errorf("%v(%v): unknown message type %d", nDB.config.Hostname, nDB.config.NodeID, mType)
	}
}
// NotifyMsg implements memberlist.Delegate and feeds incoming gossip payloads
// into the NetworkDB message handler. Empty payloads are ignored.
func (d *delegate) NotifyMsg(buf []byte) {
	if len(buf) == 0 {
		return
	}
	d.nDB.handleMessage(buf, false)
}
// GetBroadcasts implements memberlist.Delegate. It returns pending
// cluster-scoped broadcasts from the network-event and node-event queues
// (see getBroadcasts for the overhead/limit accounting).
func (d *delegate) GetBroadcasts(overhead, limit int) [][]byte {
	return getBroadcasts(overhead, limit, d.nDB.networkBroadcasts, d.nDB.nodeBroadcasts)
}
// LocalState implements memberlist.Delegate. It serializes every network
// attachment known to this node -- its own and each peer's -- into a
// push-pull message for memberlist state exchange. On join the local network
// lamport times are bumped first so the remote side accepts our state.
func (d *delegate) LocalState(join bool) []byte {
	if join {
		// Update all the local node/network state to a new time to
		// force update on the node we are trying to rejoin, just in
		// case that node has these in leaving state still. This is
		// facilitate fast convergence after recovering from a gossip
		// failure.
		d.nDB.updateLocalNetworkTime()
	}

	d.nDB.RLock()
	defer d.nDB.RUnlock()

	pp := NetworkPushPull{
		LTime:    d.nDB.networkClock.Time(),
		NodeName: d.nDB.config.NodeID,
	}

	appendEntries := func(owner string, networks map[string]*network) {
		for nid, n := range networks {
			pp.Networks = append(pp.Networks, &NetworkEntry{
				LTime:     n.ltime,
				NetworkID: nid,
				NodeName:  owner,
				Leaving:   n.leaving,
			})
		}
	}

	appendEntries(d.nDB.config.NodeID, d.nDB.thisNodeNetworks)
	for name, nn := range d.nDB.networks {
		appendEntries(name, nn)
	}

	buf, err := encodeMessage(MessageTypePushPull, &pp)
	if err != nil {
		log.G(context.TODO()).Errorf("Failed to encode local network state: %v", err)
		return nil
	}
	return buf
}
// MergeRemoteState implements memberlist.Delegate. It decodes the remote
// peer's push-pull payload and merges the peer's node and network attachment
// state into the local database. isJoin is unused.
func (d *delegate) MergeRemoteState(buf []byte, isJoin bool) {
	if len(buf) == 0 {
		log.G(context.TODO()).Error("zero byte remote network state received")
		return
	}

	var gMsg GossipMessage
	err := proto.Unmarshal(buf, &gMsg)
	if err != nil {
		log.G(context.TODO()).Errorf("Error unmarshalling push pull message: %v", err)
		return
	}

	if gMsg.Type != MessageTypePushPull {
		// Bail out instead of decoding the payload anyway: previously the
		// mislabeled message was only logged (showing the raw wire byte
		// buf[0] rather than the decoded type) and then merged regardless.
		log.G(context.TODO()).Errorf("Invalid message type %v received from remote", gMsg.Type)
		return
	}

	pp := NetworkPushPull{}
	if err := proto.Unmarshal(gMsg.Data, &pp); err != nil {
		log.G(context.TODO()).Errorf("Failed to decode remote network state: %v", err)
		return
	}

	// Treat the push-pull itself as a join event from the remote node.
	nodeEvent := &NodeEvent{
		LTime:    pp.LTime,
		NodeName: pp.NodeName,
		Type:     NodeEventTypeJoin,
	}
	d.nDB.handleNodeEvent(nodeEvent)

	// Replay each remote network attachment as a join/leave event.
	for _, n := range pp.Networks {
		nEvent := &NetworkEvent{
			LTime:     n.LTime,
			NodeName:  n.NodeName,
			NetworkID: n.NetworkID,
			Type:      NetworkEventTypeJoin,
		}

		if n.Leaving {
			nEvent.Type = NetworkEventTypeLeave
		}

		d.nDB.handleNetworkEvent(nEvent)
	}
}
package networkdb
import (
"context"
"encoding/json"
"net"
"github.com/containerd/log"
"github.com/hashicorp/memberlist"
)
// eventDelegate routes memberlist membership callbacks (NotifyJoin,
// NotifyLeave, NotifyUpdate) to the owning NetworkDB instance.
type eventDelegate struct {
	nDB *NetworkDB
}
// broadcastNodeEvent publishes a node create/delete event for the given
// address to the local event watchers.
func (e *eventDelegate) broadcastNodeEvent(addr net.IP, op opType) {
	value, err := json.Marshal(&NodeAddr{addr})
	if err != nil {
		// Include the marshalling error itself; previously it was dropped
		// from the log line.
		log.G(context.TODO()).WithError(err).Errorf("Error marshalling node broadcast event %s", addr.String())
		return
	}
	e.nDB.broadcaster.Write(makeEvent(op, NodeTable, "", "", value))
}
// NotifyJoin implements memberlist.EventDelegate and is invoked when a node
// joins the gossip cluster. It broadcasts the join to local watchers and
// records the node in the active node list.
func (e *eventDelegate) NotifyJoin(mn *memberlist.Node) {
	log.G(context.TODO()).Infof("Node %s/%s, joined gossip cluster", mn.Name, mn.Addr)
	e.broadcastNodeEvent(mn.Addr, opCreate)
	e.nDB.Lock()
	defer e.nDB.Unlock()

	// In case the node is rejoining after a failure or leave,
	// just add the node back to active
	if moved, _ := e.nDB.changeNodeState(mn.Name, nodeActiveState); moved {
		return
	}

	// Every node has a unique ID
	// Check on the base of the IP address if the new node that joined is actually a new incarnation of a previous
	// failed or shutdown one
	e.nDB.purgeReincarnation(mn)

	e.nDB.nodes[mn.Name] = &node{Node: *mn}
	// Refresh the cached estimate of the cluster size.
	e.nDB.estNodes.Store(int32(len(e.nDB.nodes)))
	log.G(context.TODO()).Infof("Node %s/%s, added to nodes list", mn.Name, mn.Addr)
}
// NotifyLeave implements memberlist.EventDelegate and is invoked when a node
// leaves (or is declared dead by) the gossip cluster. It broadcasts the
// departure to local watchers and, if the node was active, moves it to the
// failed list.
func (e *eventDelegate) NotifyLeave(mn *memberlist.Node) {
	log.G(context.TODO()).Infof("Node %s/%s, left gossip cluster", mn.Name, mn.Addr)
	e.broadcastNodeEvent(mn.Addr, opDelete)

	e.nDB.Lock()
	defer e.nDB.Unlock()

	n, currState, _ := e.nDB.findNode(mn.Name)
	if n == nil {
		log.G(context.TODO()).Errorf("Node %s/%s not found in the node lists", mn.Name, mn.Addr)
		return
	}
	// if the node was active means that did not send the leave cluster message, so it's probable that
	// failed. Else would be already in the left list so nothing else has to be done
	if currState == nodeActiveState {
		moved, err := e.nDB.changeNodeState(mn.Name, nodeFailedState)
		if err != nil {
			log.G(context.TODO()).WithError(err).Errorf("impossible condition, node %s/%s not present in the list", mn.Name, mn.Addr)
			return
		}
		if moved {
			log.G(context.TODO()).Infof("Node %s/%s, added to failed nodes list", mn.Name, mn.Addr)
		}
	}
}
// NotifyUpdate implements memberlist.EventDelegate. Node metadata updates are
// intentionally ignored (see NodeMeta: NetworkDB publishes no metadata).
func (e *eventDelegate) NotifyUpdate(n *memberlist.Node) {
}
package networkdb
import "github.com/gogo/protobuf/proto"
const (
	// compoundHeaderOverhead is the compound message header overhead:
	// 1 byte (message type) + 4 bytes (num messages).
	compoundHeaderOverhead = 5

	// compoundOverhead is the overhead for each embedded message in a
	// compound message: 4 bytes (len of embedded message).
	compoundOverhead = 4
)
// encodeRawMessage wraps an already-serialized payload in a GossipMessage
// envelope of the given type and marshals the result.
func encodeRawMessage(t MessageType, raw []byte) ([]byte, error) {
	env := GossipMessage{
		Type: t,
		Data: raw,
	}
	return proto.Marshal(&env)
}
// encodeMessage serializes msg (which must implement proto.Message) and wraps
// it in a GossipMessage envelope of the given type.
func encodeMessage(t MessageType, msg interface{}) ([]byte, error) {
	payload, err := proto.Marshal(msg.(proto.Message))
	if err != nil {
		return nil, err
	}
	return encodeRawMessage(t, payload)
}
// decodeMessage unwraps a GossipMessage envelope, returning the embedded
// message type and payload.
func decodeMessage(buf []byte) (MessageType, []byte, error) {
	var env GossipMessage
	if err := proto.Unmarshal(buf, &env); err != nil {
		return MessageTypeInvalid, nil, err
	}
	return env.Type, env.Data, nil
}
// makeCompoundMessage takes a list of messages and generates a single
// compound message containing all of them, wrapped in a GossipMessage
// envelope. It returns nil if either marshalling step fails.
func makeCompoundMessage(msgs [][]byte) []byte {
	cMsg := CompoundMessage{
		Messages: make([]*CompoundMessage_SimpleMessage, 0, len(msgs)),
	}
	for _, m := range msgs {
		cMsg.Messages = append(cMsg.Messages, &CompoundMessage_SimpleMessage{
			Payload: m,
		})
	}

	payload, err := proto.Marshal(&cMsg)
	if err != nil {
		return nil
	}

	env := GossipMessage{
		Type: MessageTypeCompound,
		Data: payload,
	}
	out, err := proto.Marshal(&env)
	if err != nil {
		return nil
	}
	return out
}
// decodeCompoundMessage splits a compound message and returns
// the slices of individual messages. Returns any potential error.
func decodeCompoundMessage(buf []byte) ([][]byte, error) {
	var compound CompoundMessage
	if err := proto.Unmarshal(buf, &compound); err != nil {
		return nil, err
	}
	out := make([][]byte, 0, len(compound.Messages))
	for _, sm := range compound.Messages {
		out = append(out, sm.Payload)
	}
	return out, nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package networkdb
//go:generate protoc -I=. -I=../../../vendor/ --gogofaster_out=import_path=github.com/docker/docker/daemon/libnetwork/networkdb:. networkdb.proto
import (
"context"
cryptorand "crypto/rand"
"fmt"
"math/rand/v2"
"os"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/pkg/stringid"
"github.com/docker/go-events"
iradix "github.com/hashicorp/go-immutable-radix/v2"
"github.com/hashicorp/memberlist"
"github.com/hashicorp/serf/serf"
)
const (
	// Index identifiers for the two radix trees kept in NetworkDB.indexes:
	// byTable keys entries as /table/network/key, byNetwork as
	// /network/table/key.
	byTable int = 1 + iota
	byNetwork
)
// NetworkDB instance drives the networkdb cluster and acts the broker
// for cluster-scoped and network-scoped gossip and watches.
type NetworkDB struct {
	// The clocks MUST be the first things
	// in this struct due to Golang issue #599.
	// (64-bit atomic alignment on 32-bit platforms.)
	// Global lamport clock for node network attach events.
	networkClock serf.LamportClock
	// Global lamport clock for table events.
	tableClock serf.LamportClock
	// Embedded RWMutex guards all the maps and indexes below.
	sync.RWMutex
	// NetworkDB configuration.
	config *Config
	// All the tree index (byTable, byNetwork) that we maintain
	// the db.
	indexes map[int]*iradix.Tree[*entry]
	// Memberlist we use to drive the cluster.
	memberlist *memberlist.Memberlist
	// List of all peer nodes in the cluster not-limited to any
	// network.
	nodes map[string]*node
	// An approximation of len(nodes) that can be accessed without
	// synchronization.
	estNodes atomic.Int32
	// List of all peer nodes which have failed
	failedNodes map[string]*node
	// List of all peer nodes which have left
	leftNodes map[string]*node
	// A multi-dimensional map of network/node attachments for peer nodes.
	// The first key is a node name and the second key is a network ID for
	// the network that node is participating in.
	networks map[string]map[string]*network
	// A map of this node's network attachments.
	thisNodeNetworks map[string]*thisNodeNetwork
	// A map of nodes which are participating in a given
	// network. The key is a network ID.
	networkNodes map[string][]string
	// A table of ack channels for every node from which we are
	// waiting for an ack.
	bulkSyncAckTbl map[string]chan struct{}
	// Broadcast queue for network event gossip.
	networkBroadcasts *memberlist.TransmitLimitedQueue
	// Broadcast queue for node event gossip.
	nodeBroadcasts *memberlist.TransmitLimitedQueue
	// A central context to stop all go routines running on
	// behalf of the NetworkDB instance.
	ctx       context.Context
	cancelCtx context.CancelFunc
	// A central broadcaster for all local watchers watching table
	// events.
	broadcaster *events.Broadcaster
	// List of all tickers which needed to be stopped when
	// cleaning up.
	tickers []*time.Ticker
	// Reference to the memberlist's keyring to add & remove keys
	keyring *memberlist.Keyring
	// bootStrapIP is the list of IPs that can be used to bootstrap
	// the gossip.
	bootStrapIP []string
	// lastStatsTimestamp is the last timestamp when the stats got printed
	lastStatsTimestamp time.Time
	// lastHealthTimestamp is the last timestamp when the health score got printed
	lastHealthTimestamp time.Time
	// rngMu guards rng, which is not safe for concurrent use.
	rngMu sync.Mutex
	rng   *rand.Rand
}
// PeerInfo represents the peer (gossip cluster) nodes of a network
type PeerInfo struct {
	// Name is the node's name within the gossip cluster.
	Name string
	// IP is the node's advertised address, formatted as a string.
	IP string
}
// PeerClusterInfo represents the peer (gossip cluster) nodes
type PeerClusterInfo struct {
	PeerInfo
}
// node is a memberlist node enriched with the lamport time of its most
// recent node event and its remaining tombstone lifetime.
type node struct {
	memberlist.Node
	ltime serf.LamportTime
	// Remaining time before the reaper removes the node from the
	// failed/left lists (a countdown duration, decremented periodically).
	reapTime time.Duration
}
// network describes the node/network attachment.
type network struct {
	// Lamport time for the latest state of the entry.
	ltime serf.LamportTime
	// Node leave is in progress.
	leaving bool
	// Number of seconds still left before a deleted network entry gets
	// removed from networkDB
	reapTime time.Duration
}
// thisNodeNetwork describes a network attachment on the local node.
type thisNodeNetwork struct {
	network
	// Gets set to true after the first bulk sync happens
	inSync bool
	// The broadcast queue for this network's table event gossip
	// for entries owned by this node.
	tableBroadcasts *memberlist.TransmitLimitedQueue
	// The broadcast queue for this network's table event gossip
	// relayed from other nodes.
	//
	// Messages in this queue are broadcasted when there is space available
	// in the gossip packet after filling it with tableBroadcast messages.
	// Relayed messages are broadcasted at a lower priority than messages
	// originating from this node to ensure that local messages are always
	// broadcasted in a timely manner, irrespective of how many messages
	// from other nodes are queued for rebroadcasting.
	tableRebroadcasts *memberlist.TransmitLimitedQueue
	// Number of gossip messages sent related to this network during the last stats collection period
	qMessagesSent atomic.Int64
	// Number of entries on the network. This value is the sum of all the entries of all the tables of a specific network.
	// Its use is for statistics purposes. It keep tracks of database size and is printed per network every StatsPrintPeriod
	// interval
	entriesNumber atomic.Int64
	// An approximation of len(nDB.networkNodes[nid]) that can be accessed
	// without synchronization.
	networkNodes atomic.Int32
}
// Config represents the configuration of the networkdb instance and
// can be passed by the caller.
type Config struct {
	// NodeID is the node unique identifier of the node when is part of the cluster
	NodeID string
	// Hostname is the node hostname.
	Hostname string
	// BindAddr is the IP on which networkdb listens. It can be
	// 0.0.0.0 to listen on all addresses on the host.
	BindAddr string
	// AdvertiseAddr is the node's IP address that we advertise for
	// cluster communication.
	AdvertiseAddr string
	// BindPort is the local node's port to which we bind to for
	// cluster communication.
	BindPort int
	// Keys to be added to the Keyring of the memberlist. Key at index
	// 0 is the primary key
	Keys [][]byte
	// PacketBufferSize is the maximum number of bytes that memberlist will
	// put in a packet (this will be for UDP packets by default with a NetTransport).
	// A safe value for this is typically 1400 bytes (which is the default). However,
	// depending on your network's MTU (Maximum Transmission Unit) you may
	// be able to increase this to get more content into each gossip packet.
	PacketBufferSize int
	// reapEntryInterval duration of a deleted entry before being garbage collected
	reapEntryInterval time.Duration
	// reapNetworkInterval duration of a deleted network before being garbage collected
	// NOTE this MUST always be higher than reapEntryInterval
	// (newNetworkDB derives it from reapEntryInterval; see that function).
	reapNetworkInterval time.Duration
	// rejoinClusterDuration represents retryJoin timeout used by rejoinClusterBootStrap.
	// Default is 10sec.
	rejoinClusterDuration time.Duration
	// rejoinClusterInterval represents interval on which rejoinClusterBootStrap runs.
	// Default is 60sec.
	rejoinClusterInterval time.Duration
	// StatsPrintPeriod the period to use to print queue stats
	// Default is 5min
	StatsPrintPeriod time.Duration
	// HealthPrintPeriod the period to use to print the health score
	// Default is 1min
	HealthPrintPeriod time.Duration
}
// entry defines a table entry
type entry struct {
	// node from which this entry was learned.
	node string
	// Lamport time for the most recent update to the entry
	ltime serf.LamportTime
	// Opaque value store in the entry
	value []byte
	// Deleting the entry is in progress. All entries linger in
	// the cluster for certain amount of time after deletion.
	deleting bool
	// Number of seconds still left before a deleted table entry gets
	// removed from networkDB
	reapTime time.Duration
}
// DefaultConfig returns a NetworkDB config with default values
func DefaultConfig() *Config {
	// Best effort: an empty hostname is acceptable if the lookup fails.
	hostname, _ := os.Hostname()
	c := &Config{
		NodeID:                stringid.TruncateID(stringid.GenerateRandomID()),
		Hostname:              hostname,
		BindAddr:              "0.0.0.0",
		PacketBufferSize:      1400,
		StatsPrintPeriod:      5 * time.Minute,
		HealthPrintPeriod:     1 * time.Minute,
		reapEntryInterval:     30 * time.Minute,
		rejoinClusterDuration: 10 * time.Second,
		rejoinClusterInterval: 60 * time.Second,
	}
	return c
}
// New creates a new instance of NetworkDB using the Config passed by
// the caller.
func New(c *Config) (*NetworkDB, error) {
	nDB := newNetworkDB(c)
	log.G(context.TODO()).Infof("New memberlist node - Node:%v will use memberlist nodeID:%v with config:%+v", c.Hostname, c.NodeID, c)
	err := nDB.clusterInit()
	if err != nil {
		return nil, err
	}
	return nDB, nil
}
// newNetworkDB builds an unstarted NetworkDB from the given config,
// initializing all internal maps, indexes and the seeded RNG.
func newNetworkDB(c *Config) *NetworkDB {
	// The garbage collection logic for entries leverage the presence of the network.
	// For this reason the expiration time of the network is put slightly higher than the entry expiration so that
	// there is at least 5 extra cycle to make sure that all the entries are properly deleted before deleting the network.
	c.reapNetworkInterval = c.reapEntryInterval + 5*reapPeriod
	var seed [32]byte
	_, _ = cryptorand.Read(seed[:]) // Documented never to return an error
	db := &NetworkDB{
		config: c,
		indexes: map[int]*iradix.Tree[*entry]{
			byTable:   iradix.New[*entry](),
			byNetwork: iradix.New[*entry](),
		},
		networks:         make(map[string]map[string]*network),
		thisNodeNetworks: make(map[string]*thisNodeNetwork),
		nodes:            make(map[string]*node),
		failedNodes:      make(map[string]*node),
		leftNodes:        make(map[string]*node),
		networkNodes:     make(map[string][]string),
		bulkSyncAckTbl:   make(map[string]chan struct{}),
		broadcaster:      events.NewBroadcaster(),
		rng:              rand.New(rand.NewChaCha8(seed)), //gosec:disable G404 -- not used in a security sensitive context
	}
	return db
}
// Join joins this NetworkDB instance with a list of peer NetworkDB
// instances passed by the caller in the form of addr:port
func (nDB *NetworkDB) Join(members []string) error {
	// Copy the caller's slice so later mutations by the caller cannot
	// affect our bootstrap list.
	bootstrap := append([]string(nil), members...)
	nDB.Lock()
	nDB.bootStrapIP = bootstrap
	log.G(context.TODO()).Infof("The new bootstrap node list is:%v", nDB.bootStrapIP)
	nDB.Unlock()
	return nDB.clusterJoin(members)
}
// Close destroys this NetworkDB instance by leave the cluster,
// stopping timers, canceling goroutines etc.
func (nDB *NetworkDB) Close() {
	err := nDB.clusterLeave()
	if err != nil {
		log.G(context.TODO()).Errorf("%v(%v) Could not close DB: %v", nDB.config.Hostname, nDB.config.NodeID, err)
	}
	// Avoid (*Broadcaster).run goroutine leak
	nDB.broadcaster.Close()
}
// ClusterPeers returns all the gossip cluster peers.
func (nDB *NetworkDB) ClusterPeers() []PeerInfo {
	nDB.RLock()
	defer nDB.RUnlock()
	out := make([]PeerInfo, 0, len(nDB.nodes))
	for _, n := range nDB.nodes {
		out = append(out, PeerInfo{
			Name: n.Name,
			IP:   n.Node.Addr.String(),
		})
	}
	return out
}
// Peers returns the gossip peers for a given network.
func (nDB *NetworkDB) Peers(nid string) []PeerInfo {
	nDB.RLock()
	defer nDB.RUnlock()
	members := nDB.networkNodes[nid]
	peers := make([]PeerInfo, 0, len(members))
	for _, name := range members {
		n, known := nDB.nodes[name]
		if !known {
			// Added for testing purposes, this condition should never happen else mean that the network list
			// is out of sync with the node list
			peers = append(peers, PeerInfo{Name: name, IP: "unknown"})
			continue
		}
		peers = append(peers, PeerInfo{
			Name: n.Name,
			IP:   n.Addr.String(),
		})
	}
	return peers
}
// GetEntry retrieves the value of a table entry in a given (network,
// table, key) tuple
func (nDB *NetworkDB) GetEntry(tname, nid, key string) ([]byte, error) {
	nDB.RLock()
	defer nDB.RUnlock()
	v, err := nDB.getEntry(tname, nid, key)
	if err != nil {
		return nil, err
	}
	// Tombstoned entries are reported as not-found until the reaper
	// removes them for good.
	if v != nil && v.deleting {
		return nil, types.NotFoundErrorf("entry in table %s network id %s and key %s deleted and pending garbage collection", tname, nid, key)
	}
	// note: this panics if a nil entry was stored in the table; after
	// discussion, we decided to not gracefully handle this situation as
	// this would be an unexpected situation;
	// see https://github.com/moby/moby/pull/48157#discussion_r1674428635
	return v.value, nil
}
// getEntry looks up an entry in the byTable index. Caller must hold at
// least a read lock on nDB.
func (nDB *NetworkDB) getEntry(tname, nid, key string) (*entry, error) {
	idxKey := []byte(fmt.Sprintf("/%s/%s/%s", tname, nid, key))
	if e, ok := nDB.indexes[byTable].Get(idxKey); ok {
		return e, nil
	}
	return nil, types.NotFoundErrorf("could not get entry in table %s with network id %s and key %s", tname, nid, key)
}
// CreateEntry creates a table entry in NetworkDB for given (network,
// table, key) tuple and if the NetworkDB is part of the cluster
// propagates this event to the cluster. It is an error to create an
// entry for the same tuple for which there is already an existing
// entry unless the current entry is deleting state.
func (nDB *NetworkDB) CreateEntry(tname, nid, key string, value []byte) error {
	nDB.Lock()
	existing, err := nDB.getEntry(tname, nid, key)
	// A live (non-tombstoned) entry already present is a conflict.
	if err == nil || (existing != nil && !existing.deleting) {
		nDB.Unlock()
		return fmt.Errorf("cannot create entry in table %s with network id %s and key %s, already exists", tname, nid, key)
	}
	e := &entry{
		ltime: nDB.tableClock.Increment(),
		node:  nDB.config.NodeID,
		value: value,
	}
	nDB.createOrUpdateEntry(nid, tname, key, e)
	nDB.Unlock()
	if err := nDB.sendTableEvent(TableEventTypeCreate, nid, tname, key, e); err != nil {
		return fmt.Errorf("cannot send create event for table %s, %v", tname, err)
	}
	return nil
}
// UpdateEntry updates a table entry in NetworkDB for given (network,
// table, key) tuple and if the NetworkDB is part of the cluster
// propagates this event to the cluster. It is an error to update a
// non-existent entry.
func (nDB *NetworkDB) UpdateEntry(tname, nid, key string, value []byte) error {
	nDB.Lock()
	_, err := nDB.getEntry(tname, nid, key)
	if err != nil {
		nDB.Unlock()
		return fmt.Errorf("cannot update entry as the entry in table %s with network id %s and key %s does not exist", tname, nid, key)
	}
	e := &entry{
		ltime: nDB.tableClock.Increment(),
		node:  nDB.config.NodeID,
		value: value,
	}
	nDB.createOrUpdateEntry(nid, tname, key, e)
	nDB.Unlock()
	if err := nDB.sendTableEvent(TableEventTypeUpdate, nid, tname, key, e); err != nil {
		return fmt.Errorf("cannot send table update event: %v", err)
	}
	return nil
}
// TableElem is a single table entry as returned by GetTableByNetwork.
type TableElem struct {
	// Value is the opaque payload stored in the entry.
	Value []byte
	// owner is the ID of the node that created/last updated the entry.
	owner string
}
// GetTableByNetwork walks the networkdb by the give table and network id and
// returns a map of keys and values
func (nDB *NetworkDB) GetTableByNetwork(tname, nid string) map[string]*TableElem {
	// Snapshot the immutable radix root under the lock; the walk itself
	// can then proceed without holding it.
	nDB.RLock()
	root := nDB.indexes[byTable].Root()
	nDB.RUnlock()
	out := make(map[string]*TableElem)
	prefix := []byte(fmt.Sprintf("/%s/%s", tname, nid))
	root.WalkPrefix(prefix, func(k []byte, v *entry) bool {
		if v.deleting {
			return false
		}
		full := string(k)
		name := full[strings.LastIndex(full, "/")+1:]
		out[name] = &TableElem{Value: v.value, owner: v.node}
		return false
	})
	return out
}
// DeleteEntry deletes a table entry in NetworkDB for given (network,
// table, key) tuple and if the NetworkDB is part of the cluster
// propagates this event to the cluster.
func (nDB *NetworkDB) DeleteEntry(tname, nid, key string) error {
	nDB.Lock()
	existing, err := nDB.getEntry(tname, nid, key)
	if err != nil || existing == nil || existing.deleting {
		nDB.Unlock()
		return fmt.Errorf("cannot delete entry %s with network id %s and key %s "+
			"does not exist or is already being deleted", tname, nid, key)
	}
	// Replace the entry with a tombstone that lingers for
	// reapEntryInterval so peers learn about the deletion.
	tombstone := &entry{
		ltime:    nDB.tableClock.Increment(),
		node:     nDB.config.NodeID,
		value:    existing.value,
		deleting: true,
		reapTime: nDB.config.reapEntryInterval,
	}
	nDB.createOrUpdateEntry(nid, tname, key, tombstone)
	nDB.Unlock()
	if err := nDB.sendTableEvent(TableEventTypeDelete, nid, tname, key, tombstone); err != nil {
		return fmt.Errorf("cannot send table delete event: %v", err)
	}
	return nil
}
// deleteNodeFromNetworks removes the given node from every network's
// participant list and drops its attachment map. Caller must hold the
// NetworkDB write lock.
func (nDB *NetworkDB) deleteNodeFromNetworks(deletedNode string) {
	for nid, members := range nDB.networkNodes {
		kept := make([]string, 0, len(members))
		for _, name := range members {
			if name != deletedNode {
				kept = append(kept, name)
			}
		}
		nDB.networkNodes[nid] = kept
	}
	delete(nDB.networks, deletedNode)
}
// deleteNodeNetworkEntries deletes all table entries for a network owned by
// node from the local store.
//
// The walk iterates a snapshot of the byNetwork index root, so deleting
// entries during the walk is safe (the tree is immutable).
// Caller is expected to hold the NetworkDB write lock.
func (nDB *NetworkDB) deleteNodeNetworkEntries(nid, node string) {
	nDB.indexes[byNetwork].Root().WalkPrefix([]byte("/"+nid),
		func(path []byte, oldEntry *entry) bool {
			// Do nothing if the entry is owned by a remote node that is not leaving the network
			// because the event is triggered for a node that does not own this entry.
			if oldEntry.node != node {
				return false
			}
			// byNetwork paths have the form /<network>/<table>/<key>.
			params := strings.Split(string(path[1:]), "/")
			nwID, tName, key := params[0], params[1], params[2]
			nDB.deleteEntry(nwID, tName, key)
			// Notify to the upper layer only entries not already marked for deletion
			if !oldEntry.deleting {
				nDB.broadcaster.Write(makeEvent(opDelete, tName, nwID, key, oldEntry.value))
			}
			return false
		})
}
// deleteNodeTableEntries deletes all table entries owned by node from the local
// store, across all networks.
//
// Like deleteNodeNetworkEntries, this walks a snapshot of the (immutable)
// index root, so deleting during the walk is safe. Caller is expected to
// hold the NetworkDB write lock.
func (nDB *NetworkDB) deleteNodeTableEntries(node string) {
	nDB.indexes[byTable].Root().Walk(func(path []byte, oldEntry *entry) bool {
		if oldEntry.node != node {
			return false
		}
		// byTable paths have the form /<table>/<network>/<key>.
		params := strings.Split(string(path[1:]), "/")
		tName, nwID, key := params[0], params[1], params[2]
		nDB.deleteEntry(nwID, tName, key)
		// Notify local watchers only for entries not already tombstoned.
		if !oldEntry.deleting {
			nDB.broadcaster.Write(makeEvent(opDelete, tName, nwID, key, oldEntry.value))
		}
		return false
	})
}
// WalkTable walks a single table in NetworkDB and invokes the passed
// function for each entry in the table passing the network, key,
// value. The walk stops if the passed function returns a true.
func (nDB *NetworkDB) WalkTable(tname string, fn func(string, string, []byte, bool) bool) error {
	// Snapshot the immutable index root under the lock, then walk
	// without holding it.
	nDB.RLock()
	root := nDB.indexes[byTable].Root()
	nDB.RUnlock()
	root.WalkPrefix([]byte("/"+tname), func(path []byte, v *entry) bool {
		// byTable paths have the form /<table>/<network>/<key>.
		fields := strings.Split(string(path[1:]), "/")
		return fn(fields[1], fields[2], v.value, v.deleting)
	})
	return nil
}
// JoinNetwork joins this node to a given network and propagates this
// event across the cluster. This triggers this node joining the
// sub-cluster of this network and participates in the network-scoped
// gossip and bulk sync for this network.
func (nDB *NetworkDB) JoinNetwork(nid string) error {
	ltime := nDB.networkClock.Increment()
	nDB.Lock()
	n, ok := nDB.thisNodeNetworks[nid]
	if ok {
		if !n.leaving {
			nDB.Unlock()
			return fmt.Errorf("networkdb: network %s is already joined", nid)
		}
		// Rejoining a network we were leaving: reset the attachment
		// state but keep the existing broadcast queues.
		n.network = network{ltime: ltime}
		n.inSync = false
	} else {
		n = &thisNodeNetwork{
			network: network{ltime: ltime},
			tableBroadcasts: &memberlist.TransmitLimitedQueue{
				RetransmitMult: 4,
			},
			tableRebroadcasts: &memberlist.TransmitLimitedQueue{
				RetransmitMult: 4,
			},
		}
		// Both queues size their retransmit count from the (approximate)
		// number of nodes attached to this network.
		numNodes := func() int { return int(n.networkNodes.Load()) }
		n.tableBroadcasts.NumNodes = numNodes
		n.tableRebroadcasts.NumNodes = numNodes
	}
	nDB.addNetworkNode(nid, nDB.config.NodeID)
	if err := nDB.sendNetworkEvent(nid, NetworkEventTypeJoin, ltime); err != nil {
		nDB.Unlock()
		return fmt.Errorf("failed to send join network event for %s: %v", nid, err)
	}
	networkNodes := nDB.networkNodes[nid]
	n.networkNodes.Store(int32(len(networkNodes)))
	nDB.thisNodeNetworks[nid] = n
	// Release the lock before bulk syncing: it can block on network I/O.
	nDB.Unlock()
	log.G(context.TODO()).Debugf("%v(%v): joined network %s", nDB.config.Hostname, nDB.config.NodeID, nid)
	if _, err := nDB.bulkSync(networkNodes, true); err != nil {
		log.G(context.TODO()).Errorf("Error bulk syncing while joining network %s: %v", nid, err)
	}
	// Mark the network as being synced
	// note this is a best effort, we are not checking the result of the bulk sync
	nDB.Lock()
	n.inSync = true
	nDB.Unlock()
	return nil
}
// LeaveNetwork leaves this node from a given network and propagates
// this event across the cluster. This triggers this node leaving the
// sub-cluster of this network and as a result will no longer
// participate in the network-scoped gossip and bulk sync for this
// network. Also remove all the table entries for this network from
// networkdb
func (nDB *NetworkDB) LeaveNetwork(nid string) error {
	ltime := nDB.networkClock.Increment()
	if err := nDB.sendNetworkEvent(nid, NetworkEventTypeLeave, ltime); err != nil {
		return fmt.Errorf("failed to send leave network event for %s: %v", nid, err)
	}
	nDB.Lock()
	defer nDB.Unlock()
	// Remove myself from the list of the nodes participating to the network
	nDB.deleteNetworkNode(nid, nDB.config.NodeID)
	// Mark all the local entries for deletion
	// so that if we rejoin the network
	// before another node has received the network-leave notification,
	// the old entries owned by us will still be purged as expected.
	// Delete all the remote entries from our local store
	// without leaving any tombstone.
	// This ensures that we will accept the CREATE events
	// for entries owned by remote nodes
	// if we later rejoin the network.
	nDB.indexes[byNetwork].Root().WalkPrefix([]byte("/"+nid), func(path []byte, oldEntry *entry) bool {
		owned := oldEntry.node == nDB.config.NodeID
		// Our own entries that are already tombstoned need no more work.
		if owned && oldEntry.deleting {
			return false
		}
		// byNetwork paths have the form /<network>/<table>/<key>.
		params := strings.Split(string(path[1:]), "/")
		nwID, tName, key := params[0], params[1], params[2]
		if owned {
			// Tombstone our own entry with a fresh lamport time so it is
			// reaped even if peers miss the network-leave event.
			newEntry := &entry{
				ltime:    nDB.tableClock.Increment(),
				node:     oldEntry.node,
				value:    oldEntry.value,
				deleting: true,
				reapTime: nDB.config.reapEntryInterval,
			}
			nDB.createOrUpdateEntry(nwID, tName, key, newEntry)
		} else {
			nDB.deleteEntry(nwID, tName, key)
		}
		// Notify local watchers only for entries not already tombstoned.
		if !oldEntry.deleting {
			nDB.broadcaster.Write(makeEvent(opDelete, tName, nwID, key, oldEntry.value))
		}
		return false
	})
	n, ok := nDB.thisNodeNetworks[nid]
	if !ok {
		return fmt.Errorf("could not find network %s while trying to leave", nid)
	}
	log.G(context.TODO()).Debugf("%v(%v): leaving network %s", nDB.config.Hostname, nDB.config.NodeID, nid)
	// Keep the attachment around in "leaving" state until the reaper
	// removes it after reapNetworkInterval.
	n.ltime = ltime
	n.reapTime = nDB.config.reapNetworkInterval
	n.leaving = true
	return nil
}
// addNetworkNode adds the node to the list of nodes which participate
// in the passed network only if it is not already present. Caller
// should hold the NetworkDB lock while calling this
func (nDB *NetworkDB) addNetworkNode(nid string, nodeName string) {
	for _, existing := range nDB.networkNodes[nid] {
		if existing == nodeName {
			return
		}
	}
	nDB.networkNodes[nid] = append(nDB.networkNodes[nid], nodeName)
	// Keep the lock-free participant counter in sync for local attachments.
	if n, ok := nDB.thisNodeNetworks[nid]; ok {
		n.networkNodes.Store(int32(len(nDB.networkNodes[nid])))
	}
}
// deleteNetworkNode deletes the node from the list of nodes which
// participate in the passed network. Caller should hold the NetworkDB
// lock while calling this.
func (nDB *NetworkDB) deleteNetworkNode(nid string, nodeName string) {
	members, ok := nDB.networkNodes[nid]
	if !ok || len(members) == 0 {
		return
	}
	kept := make([]string, 0, len(members)-1)
	for _, name := range members {
		if name != nodeName {
			kept = append(kept, name)
		}
	}
	nDB.networkNodes[nid] = kept
	// Keep the lock-free participant counter in sync for local attachments.
	if n, ok := nDB.thisNodeNetworks[nid]; ok {
		n.networkNodes.Store(int32(len(kept)))
	}
}
// findCommonNetworks find the networks that both this node and the
// passed node have joined.
func (nDB *NetworkDB) findCommonNetworks(nodeName string) []string {
	nDB.RLock()
	defer nDB.RUnlock()
	var common []string
	remote := nDB.networks[nodeName]
	for nid := range nDB.thisNodeNetworks {
		if attachment, ok := remote[nid]; ok && !attachment.leaving {
			common = append(common, nid)
		}
	}
	return common
}
// updateLocalNetworkTime bumps the network lamport clock once and stamps
// all local network attachments with the new time.
func (nDB *NetworkDB) updateLocalNetworkTime() {
	nDB.Lock()
	defer nDB.Unlock()
	now := nDB.networkClock.Increment()
	for _, attachment := range nDB.thisNodeNetworks {
		attachment.ltime = now
	}
}
// createOrUpdateEntry this function handles the creation or update of entries into the local
// tree store. It is also used to keep in sync the entries number of the network (all tables are aggregated)
func (nDB *NetworkDB) createOrUpdateEntry(nid, tname, key string, v *entry) (okTable bool, okNetwork bool) {
	tableKey := []byte(fmt.Sprintf("/%s/%s/%s", tname, nid, key))
	networkKey := []byte(fmt.Sprintf("/%s/%s/%s", nid, tname, key))
	nDB.indexes[byTable], _, okTable = nDB.indexes[byTable].Insert(tableKey, v)
	nDB.indexes[byNetwork], _, okNetwork = nDB.indexes[byNetwork].Insert(networkKey, v)
	if !okNetwork {
		// Add only if it is an insert not an update
		if n, ok := nDB.thisNodeNetworks[nid]; ok {
			n.entriesNumber.Add(1)
		}
	}
	return okTable, okNetwork
}
// deleteEntry this function handles the deletion of entries into the local tree store.
// It is also used to keep in sync the entries number of the network (all tables are aggregated)
func (nDB *NetworkDB) deleteEntry(nid, tname, key string) (okTable bool, okNetwork bool) {
	tableKey := []byte(fmt.Sprintf("/%s/%s/%s", tname, nid, key))
	networkKey := []byte(fmt.Sprintf("/%s/%s/%s", nid, tname, key))
	nDB.indexes[byTable], _, okTable = nDB.indexes[byTable].Delete(tableKey)
	nDB.indexes[byNetwork], _, okNetwork = nDB.indexes[byNetwork].Delete(networkKey)
	if okNetwork {
		// Remove only if the delete is successful
		if n, ok := nDB.thisNodeNetworks[nid]; ok {
			n.entriesNumber.Add(-1)
		}
	}
	return okTable, okNetwork
}
// Code generated by protoc-gen-gogo. DO NOT EDIT.
// source: networkdb.proto
package networkdb
import (
fmt "fmt"
_ "github.com/gogo/protobuf/gogoproto"
proto "github.com/gogo/protobuf/proto"
github_com_hashicorp_serf_serf "github.com/hashicorp/serf/serf"
io "io"
math "math"
math_bits "math/bits"
reflect "reflect"
strings "strings"
)
// NOTE(review): everything below is emitted by protoc-gen-gogo from
// networkdb.proto — regenerate rather than editing by hand.
// Reference imports to suppress errors if they are not otherwise used.
var _ = proto.Marshal
var _ = fmt.Errorf
var _ = math.Inf
// This is a compile-time assertion to ensure that this generated file
// is compatible with the proto package it is being compiled against.
// A compilation error at this line likely means your copy of the
// proto package needs to be updated.
const _ = proto.GoGoProtoPackageIsVersion3 // please upgrade the proto package
// MessageType enum defines all the core message types that networkdb
// uses to communicate to peers.
//
// Generated by protoc-gen-gogo; do not edit by hand.
type MessageType int32
const (
	MessageTypeInvalid MessageType = 0
	// NetworkEvent message type is used to communicate network
	// attachments on the node.
	MessageTypeNetworkEvent MessageType = 1
	// TableEvent message type is used to communicate any table
	// CRUD event that happened on the node.
	MessageTypeTableEvent MessageType = 2
	// PushPull message type is used to syncup all network
	// attachments on a peer node either during startup of this
	// node or with a random peer node periodically thereafter.
	MessageTypePushPull MessageType = 3
	// BulkSync message is used to bulksync the whole networkdb
	// state with a peer node during startup of this node or with
	// a random peer node periodically thereafter.
	MessageTypeBulkSync MessageType = 4
	// Compound message type is used to form a compound message
	// which is a pack of many message of above types, packed into
	// a single compound message.
	MessageTypeCompound MessageType = 5
	// NodeEvent message type is used to communicate node
	// join/leave events in the cluster
	MessageTypeNodeEvent MessageType = 6
)
// MessageType_name maps wire enum values to their proto names.
var MessageType_name = map[int32]string{
	0: "INVALID",
	1: "NETWORK_EVENT",
	2: "TABLE_EVENT",
	3: "PUSH_PULL",
	4: "BULK_SYNC",
	5: "COMPOUND",
	6: "NODE_EVENT",
}
// MessageType_value is the inverse of MessageType_name.
var MessageType_value = map[string]int32{
	"INVALID":       0,
	"NETWORK_EVENT": 1,
	"TABLE_EVENT":   2,
	"PUSH_PULL":     3,
	"BULK_SYNC":     4,
	"COMPOUND":      5,
	"NODE_EVENT":    6,
}
// String returns the proto name of the message type.
func (x MessageType) String() string {
	return proto.EnumName(MessageType_name, int32(x))
}
// EnumDescriptor returns the serialized file descriptor and the path of
// this enum within it.
func (MessageType) EnumDescriptor() ([]byte, []int) {
	return fileDescriptor_51036566ca8c9782, []int{0}
}
// NodeEvent_Type enumerates node-level cluster events.
//
// Generated by protoc-gen-gogo; do not edit by hand.
type NodeEvent_Type int32
const (
	NodeEventTypeInvalid NodeEvent_Type = 0
	// Join event is generated when this node joins the cluster.
	NodeEventTypeJoin NodeEvent_Type = 1
	// Leave event is generated when this node leaves the cluster.
	NodeEventTypeLeave NodeEvent_Type = 2
)
// NodeEvent_Type_name maps wire enum values to their proto names.
var NodeEvent_Type_name = map[int32]string{
	0: "INVALID",
	1: "JOIN",
	2: "LEAVE",
}
// NodeEvent_Type_value is the inverse of NodeEvent_Type_name.
var NodeEvent_Type_value = map[string]int32{
	"INVALID": 0,
	"JOIN":    1,
	"LEAVE":   2,
}
// String returns the proto name of the event type.
func (x NodeEvent_Type) String() string {
	return proto.EnumName(NodeEvent_Type_name, int32(x))
}
// EnumDescriptor returns the serialized file descriptor and the path of
// this enum within it.
func (NodeEvent_Type) EnumDescriptor() ([]byte, []int) {
	return fileDescriptor_51036566ca8c9782, []int{1, 0}
}
// NetworkEvent_Type enumerates network attach/detach events.
//
// Generated by protoc-gen-gogo; do not edit by hand.
type NetworkEvent_Type int32
const (
	NetworkEventTypeInvalid NetworkEvent_Type = 0
	// Join event is generated when this node joins a network.
	NetworkEventTypeJoin NetworkEvent_Type = 1
	// Leave event is generated when this node leaves a network.
	NetworkEventTypeLeave NetworkEvent_Type = 2
)
// NetworkEvent_Type_name maps wire enum values to their proto names.
var NetworkEvent_Type_name = map[int32]string{
	0: "INVALID",
	1: "JOIN",
	2: "LEAVE",
}
// NetworkEvent_Type_value is the inverse of NetworkEvent_Type_name.
var NetworkEvent_Type_value = map[string]int32{
	"INVALID": 0,
	"JOIN":    1,
	"LEAVE":   2,
}
// String returns the proto name of the event type.
func (x NetworkEvent_Type) String() string {
	return proto.EnumName(NetworkEvent_Type_name, int32(x))
}
// EnumDescriptor returns the serialized file descriptor and the path of
// this enum within it.
func (NetworkEvent_Type) EnumDescriptor() ([]byte, []int) {
	return fileDescriptor_51036566ca8c9782, []int{2, 0}
}
// TableEvent_Type enumerates table CRUD events.
//
// Generated by protoc-gen-gogo; do not edit by hand.
type TableEvent_Type int32
const (
	TableEventTypeInvalid TableEvent_Type = 0
	// Create signifies that this table entry was just
	// created.
	TableEventTypeCreate TableEvent_Type = 1
	// Update signifies that this table entry was just
	// updated.
	TableEventTypeUpdate TableEvent_Type = 2
	// Delete signifies that this table entry was just
	// updated.
	TableEventTypeDelete TableEvent_Type = 3
)
// TableEvent_Type_name maps wire enum values to their proto names.
var TableEvent_Type_name = map[int32]string{
	0: "INVALID",
	1: "CREATE",
	2: "UPDATE",
	3: "DELETE",
}
// TableEvent_Type_value is the inverse of TableEvent_Type_name.
var TableEvent_Type_value = map[string]int32{
	"INVALID": 0,
	"CREATE":  1,
	"UPDATE":  2,
	"DELETE":  3,
}
// String returns the proto name of the event type.
func (x TableEvent_Type) String() string {
	return proto.EnumName(TableEvent_Type_name, int32(x))
}
// EnumDescriptor returns the serialized file descriptor and the path of
// this enum within it.
func (TableEvent_Type) EnumDescriptor() ([]byte, []int) {
	return fileDescriptor_51036566ca8c9782, []int{5, 0}
}
// GossipMessage is a basic message header used by all messages types.
//
// Generated by protoc-gen-gogo; do not edit by hand.
type GossipMessage struct {
	Type MessageType `protobuf:"varint,1,opt,name=type,proto3,enum=networkdb.MessageType" json:"type,omitempty"`
	Data []byte      `protobuf:"bytes,2,opt,name=data,proto3" json:"data,omitempty"`
}
// Reset clears the message to its zero value.
func (m *GossipMessage) Reset()      { *m = GossipMessage{} }
func (*GossipMessage) ProtoMessage() {}
// Descriptor returns the serialized file descriptor and this message's path within it.
func (*GossipMessage) Descriptor() ([]byte, []int) {
	return fileDescriptor_51036566ca8c9782, []int{0}
}
func (m *GossipMessage) XXX_Unmarshal(b []byte) error {
	return m.Unmarshal(b)
}
func (m *GossipMessage) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
	if deterministic {
		return xxx_messageInfo_GossipMessage.Marshal(b, m, deterministic)
	} else {
		b = b[:cap(b)]
		n, err := m.MarshalToSizedBuffer(b)
		if err != nil {
			return nil, err
		}
		return b[:n], nil
	}
}
func (m *GossipMessage) XXX_Merge(src proto.Message) {
	xxx_messageInfo_GossipMessage.Merge(m, src)
}
func (m *GossipMessage) XXX_Size() int {
	return m.Size()
}
func (m *GossipMessage) XXX_DiscardUnknown() {
	xxx_messageInfo_GossipMessage.DiscardUnknown(m)
}
var xxx_messageInfo_GossipMessage proto.InternalMessageInfo
// GetType returns the message type, or the Invalid zero value for a nil receiver.
func (m *GossipMessage) GetType() MessageType {
	if m != nil {
		return m.Type
	}
	return MessageTypeInvalid
}
// GetData returns the raw payload, or nil for a nil receiver.
func (m *GossipMessage) GetData() []byte {
	if m != nil {
		return m.Data
	}
	return nil
}
// NodeEvent message payload definition.
type NodeEvent struct {
	// Type of node event (join/leave).
	Type NodeEvent_Type `protobuf:"varint,1,opt,name=type,proto3,enum=networkdb.NodeEvent_Type" json:"type,omitempty"`
	// Lamport time using a network lamport clock indicating the
	// time this event was generated on the node where it was
	// generated.
	LTime github_com_hashicorp_serf_serf.LamportTime `protobuf:"varint,2,opt,name=l_time,json=lTime,proto3,customtype=github.com/hashicorp/serf/serf.LamportTime" json:"l_time"`
	// Source node name.
	NodeName string `protobuf:"bytes,3,opt,name=node_name,json=nodeName,proto3" json:"node_name,omitempty"`
}

// Reset restores the message to its zero value.
func (m *NodeEvent) Reset() { *m = NodeEvent{} }

// ProtoMessage marks NodeEvent as a protobuf message.
func (*NodeEvent) ProtoMessage() {}

// Descriptor returns the compressed file descriptor and this message's index.
func (*NodeEvent) Descriptor() ([]byte, []int) {
	return fileDescriptor_51036566ca8c9782, []int{1}
}

// XXX_Unmarshal decodes b into m using the generated unmarshaler.
func (m *NodeEvent) XXX_Unmarshal(b []byte) error {
	return m.Unmarshal(b)
}

// XXX_Marshal serializes m into b; deterministic selects the
// reflection-based marshaler over the generated fast path.
func (m *NodeEvent) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
	if deterministic {
		return xxx_messageInfo_NodeEvent.Marshal(b, m, deterministic)
	} else {
		b = b[:cap(b)]
		n, err := m.MarshalToSizedBuffer(b)
		if err != nil {
			return nil, err
		}
		return b[:n], nil
	}
}

// XXX_Merge merges src into m.
func (m *NodeEvent) XXX_Merge(src proto.Message) {
	xxx_messageInfo_NodeEvent.Merge(m, src)
}

// XXX_Size reports the encoded size of m in bytes.
func (m *NodeEvent) XXX_Size() int {
	return m.Size()
}

// XXX_DiscardUnknown drops fields not recognized by the schema.
func (m *NodeEvent) XXX_DiscardUnknown() {
	xxx_messageInfo_NodeEvent.DiscardUnknown(m)
}

// xxx_messageInfo_NodeEvent caches reflection info for NodeEvent.
var xxx_messageInfo_NodeEvent proto.InternalMessageInfo
// GetType returns the event type, or NodeEventTypeInvalid for a nil receiver.
func (m *NodeEvent) GetType() NodeEvent_Type {
	if m == nil {
		return NodeEventTypeInvalid
	}
	return m.Type
}

// GetNodeName returns the source node name, or "" for a nil receiver.
func (m *NodeEvent) GetNodeName() string {
	if m == nil {
		return ""
	}
	return m.NodeName
}
// NetworkEvent message payload definition.
type NetworkEvent struct {
	// Type of network event (join/leave).
	Type NetworkEvent_Type `protobuf:"varint,1,opt,name=type,proto3,enum=networkdb.NetworkEvent_Type" json:"type,omitempty"`
	// Lamport time using a network lamport clock indicating the
	// time this event was generated on the node where it was
	// generated.
	LTime github_com_hashicorp_serf_serf.LamportTime `protobuf:"varint,2,opt,name=l_time,json=lTime,proto3,customtype=github.com/hashicorp/serf/serf.LamportTime" json:"l_time"`
	// Source node name.
	NodeName string `protobuf:"bytes,3,opt,name=node_name,json=nodeName,proto3" json:"node_name,omitempty"`
	// ID of the network for which the event is generated.
	NetworkID string `protobuf:"bytes,4,opt,name=network_id,json=networkId,proto3" json:"network_id,omitempty"`
}

// Reset restores the message to its zero value.
func (m *NetworkEvent) Reset() { *m = NetworkEvent{} }

// ProtoMessage marks NetworkEvent as a protobuf message.
func (*NetworkEvent) ProtoMessage() {}

// Descriptor returns the compressed file descriptor and this message's index.
func (*NetworkEvent) Descriptor() ([]byte, []int) {
	return fileDescriptor_51036566ca8c9782, []int{2}
}

// XXX_Unmarshal decodes b into m using the generated unmarshaler.
func (m *NetworkEvent) XXX_Unmarshal(b []byte) error {
	return m.Unmarshal(b)
}

// XXX_Marshal serializes m into b; deterministic selects the
// reflection-based marshaler over the generated fast path.
func (m *NetworkEvent) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
	if deterministic {
		return xxx_messageInfo_NetworkEvent.Marshal(b, m, deterministic)
	} else {
		b = b[:cap(b)]
		n, err := m.MarshalToSizedBuffer(b)
		if err != nil {
			return nil, err
		}
		return b[:n], nil
	}
}

// XXX_Merge merges src into m.
func (m *NetworkEvent) XXX_Merge(src proto.Message) {
	xxx_messageInfo_NetworkEvent.Merge(m, src)
}

// XXX_Size reports the encoded size of m in bytes.
func (m *NetworkEvent) XXX_Size() int {
	return m.Size()
}

// XXX_DiscardUnknown drops fields not recognized by the schema.
func (m *NetworkEvent) XXX_DiscardUnknown() {
	xxx_messageInfo_NetworkEvent.DiscardUnknown(m)
}

// xxx_messageInfo_NetworkEvent caches reflection info for NetworkEvent.
var xxx_messageInfo_NetworkEvent proto.InternalMessageInfo
// GetType returns the event type, or NetworkEventTypeInvalid for a nil
// receiver.
func (m *NetworkEvent) GetType() NetworkEvent_Type {
	if m == nil {
		return NetworkEventTypeInvalid
	}
	return m.Type
}

// GetNodeName returns the source node name, or "" for a nil receiver.
func (m *NetworkEvent) GetNodeName() string {
	if m == nil {
		return ""
	}
	return m.NodeName
}

// GetNetworkID returns the network ID, or "" for a nil receiver.
func (m *NetworkEvent) GetNetworkID() string {
	if m == nil {
		return ""
	}
	return m.NetworkID
}
// NetworkEntry for push pull of networks.
type NetworkEntry struct {
	// ID of the network
	NetworkID string `protobuf:"bytes,1,opt,name=network_id,json=networkId,proto3" json:"network_id,omitempty"`
	// Latest lamport time of the network attachment when this
	// network event was recorded.
	LTime github_com_hashicorp_serf_serf.LamportTime `protobuf:"varint,2,opt,name=l_time,json=lTime,proto3,customtype=github.com/hashicorp/serf/serf.LamportTime" json:"l_time"`
	// Source node name where this network attachment happened.
	NodeName string `protobuf:"bytes,3,opt,name=node_name,json=nodeName,proto3" json:"node_name,omitempty"`
	// Indicates if a leave from this network is in progress.
	Leaving bool `protobuf:"varint,4,opt,name=leaving,proto3" json:"leaving,omitempty"`
}

// Reset restores the message to its zero value.
func (m *NetworkEntry) Reset() { *m = NetworkEntry{} }

// ProtoMessage marks NetworkEntry as a protobuf message.
func (*NetworkEntry) ProtoMessage() {}

// Descriptor returns the compressed file descriptor and this message's index.
func (*NetworkEntry) Descriptor() ([]byte, []int) {
	return fileDescriptor_51036566ca8c9782, []int{3}
}

// XXX_Unmarshal decodes b into m using the generated unmarshaler.
func (m *NetworkEntry) XXX_Unmarshal(b []byte) error {
	return m.Unmarshal(b)
}

// XXX_Marshal serializes m into b; deterministic selects the
// reflection-based marshaler over the generated fast path.
func (m *NetworkEntry) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
	if deterministic {
		return xxx_messageInfo_NetworkEntry.Marshal(b, m, deterministic)
	} else {
		b = b[:cap(b)]
		n, err := m.MarshalToSizedBuffer(b)
		if err != nil {
			return nil, err
		}
		return b[:n], nil
	}
}

// XXX_Merge merges src into m.
func (m *NetworkEntry) XXX_Merge(src proto.Message) {
	xxx_messageInfo_NetworkEntry.Merge(m, src)
}

// XXX_Size reports the encoded size of m in bytes.
func (m *NetworkEntry) XXX_Size() int {
	return m.Size()
}

// XXX_DiscardUnknown drops fields not recognized by the schema.
func (m *NetworkEntry) XXX_DiscardUnknown() {
	xxx_messageInfo_NetworkEntry.DiscardUnknown(m)
}

// xxx_messageInfo_NetworkEntry caches reflection info for NetworkEntry.
var xxx_messageInfo_NetworkEntry proto.InternalMessageInfo
// GetNetworkID returns the network ID, or "" for a nil receiver.
func (m *NetworkEntry) GetNetworkID() string {
	if m == nil {
		return ""
	}
	return m.NetworkID
}

// GetNodeName returns the source node name, or "" for a nil receiver.
func (m *NetworkEntry) GetNodeName() string {
	if m == nil {
		return ""
	}
	return m.NodeName
}

// GetLeaving reports whether a leave is in progress; false for a nil
// receiver.
func (m *NetworkEntry) GetLeaving() bool {
	if m == nil {
		return false
	}
	return m.Leaving
}
// NetworkPushpull message payload definition.
type NetworkPushPull struct {
	// Lamport time when this push pull was initiated.
	LTime github_com_hashicorp_serf_serf.LamportTime `protobuf:"varint,1,opt,name=l_time,json=lTime,proto3,customtype=github.com/hashicorp/serf/serf.LamportTime" json:"l_time"`
	// Networks known to the sending node at LTime.
	Networks []*NetworkEntry `protobuf:"bytes,2,rep,name=networks,proto3" json:"networks,omitempty"`
	// Name of the node sending this push pull payload.
	NodeName string `protobuf:"bytes,3,opt,name=node_name,json=nodeName,proto3" json:"node_name,omitempty"`
}

// Reset restores the message to its zero value.
func (m *NetworkPushPull) Reset() { *m = NetworkPushPull{} }

// ProtoMessage marks NetworkPushPull as a protobuf message.
func (*NetworkPushPull) ProtoMessage() {}

// Descriptor returns the compressed file descriptor and this message's index.
func (*NetworkPushPull) Descriptor() ([]byte, []int) {
	return fileDescriptor_51036566ca8c9782, []int{4}
}

// XXX_Unmarshal decodes b into m using the generated unmarshaler.
func (m *NetworkPushPull) XXX_Unmarshal(b []byte) error {
	return m.Unmarshal(b)
}

// XXX_Marshal serializes m into b; deterministic selects the
// reflection-based marshaler over the generated fast path.
func (m *NetworkPushPull) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
	if deterministic {
		return xxx_messageInfo_NetworkPushPull.Marshal(b, m, deterministic)
	} else {
		b = b[:cap(b)]
		n, err := m.MarshalToSizedBuffer(b)
		if err != nil {
			return nil, err
		}
		return b[:n], nil
	}
}

// XXX_Merge merges src into m.
func (m *NetworkPushPull) XXX_Merge(src proto.Message) {
	xxx_messageInfo_NetworkPushPull.Merge(m, src)
}

// XXX_Size reports the encoded size of m in bytes.
func (m *NetworkPushPull) XXX_Size() int {
	return m.Size()
}

// XXX_DiscardUnknown drops fields not recognized by the schema.
func (m *NetworkPushPull) XXX_DiscardUnknown() {
	xxx_messageInfo_NetworkPushPull.DiscardUnknown(m)
}

// xxx_messageInfo_NetworkPushPull caches reflection info for NetworkPushPull.
var xxx_messageInfo_NetworkPushPull proto.InternalMessageInfo
// GetNetworks returns the network entries, or nil for a nil receiver.
func (m *NetworkPushPull) GetNetworks() []*NetworkEntry {
	if m == nil {
		return nil
	}
	return m.Networks
}

// GetNodeName returns the sending node's name, or "" for a nil receiver.
func (m *NetworkPushPull) GetNodeName() string {
	if m == nil {
		return ""
	}
	return m.NodeName
}
// TableEvent message payload definition.
type TableEvent struct {
	// Type of table event (create/update/delete).
	Type TableEvent_Type `protobuf:"varint,1,opt,name=type,proto3,enum=networkdb.TableEvent_Type" json:"type,omitempty"`
	// Lamport time when this event was generated.
	LTime github_com_hashicorp_serf_serf.LamportTime `protobuf:"varint,2,opt,name=l_time,json=lTime,proto3,customtype=github.com/hashicorp/serf/serf.LamportTime" json:"l_time"`
	// Node name where this event originated.
	NodeName string `protobuf:"bytes,3,opt,name=node_name,json=nodeName,proto3" json:"node_name,omitempty"`
	// ID of the network to which this table entry belongs.
	NetworkID string `protobuf:"bytes,4,opt,name=network_id,json=networkId,proto3" json:"network_id,omitempty"`
	// Name of the table to which this table entry belongs.
	TableName string `protobuf:"bytes,5,opt,name=table_name,json=tableName,proto3" json:"table_name,omitempty"`
	// Entry key.
	Key string `protobuf:"bytes,6,opt,name=key,proto3" json:"key,omitempty"`
	// Entry value.
	Value []byte `protobuf:"bytes,7,opt,name=value,proto3" json:"value,omitempty"`
	// Residual reap time for the entry before getting deleted in seconds
	ResidualReapTime int32 `protobuf:"varint,8,opt,name=residual_reap_time,json=residualReapTime,proto3" json:"residual_reap_time,omitempty"`
}

// Reset restores the message to its zero value.
func (m *TableEvent) Reset() { *m = TableEvent{} }

// ProtoMessage marks TableEvent as a protobuf message.
func (*TableEvent) ProtoMessage() {}

// Descriptor returns the compressed file descriptor and this message's index.
func (*TableEvent) Descriptor() ([]byte, []int) {
	return fileDescriptor_51036566ca8c9782, []int{5}
}

// XXX_Unmarshal decodes b into m using the generated unmarshaler.
func (m *TableEvent) XXX_Unmarshal(b []byte) error {
	return m.Unmarshal(b)
}

// XXX_Marshal serializes m into b; deterministic selects the
// reflection-based marshaler over the generated fast path.
func (m *TableEvent) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
	if deterministic {
		return xxx_messageInfo_TableEvent.Marshal(b, m, deterministic)
	} else {
		b = b[:cap(b)]
		n, err := m.MarshalToSizedBuffer(b)
		if err != nil {
			return nil, err
		}
		return b[:n], nil
	}
}

// XXX_Merge merges src into m.
func (m *TableEvent) XXX_Merge(src proto.Message) {
	xxx_messageInfo_TableEvent.Merge(m, src)
}

// XXX_Size reports the encoded size of m in bytes.
func (m *TableEvent) XXX_Size() int {
	return m.Size()
}

// XXX_DiscardUnknown drops fields not recognized by the schema.
func (m *TableEvent) XXX_DiscardUnknown() {
	xxx_messageInfo_TableEvent.DiscardUnknown(m)
}

// xxx_messageInfo_TableEvent caches reflection info for TableEvent.
var xxx_messageInfo_TableEvent proto.InternalMessageInfo
// GetType returns the event type, or TableEventTypeInvalid for a nil
// receiver.
func (m *TableEvent) GetType() TableEvent_Type {
	if m == nil {
		return TableEventTypeInvalid
	}
	return m.Type
}

// GetNodeName returns the originating node name, or "" for a nil receiver.
func (m *TableEvent) GetNodeName() string {
	if m == nil {
		return ""
	}
	return m.NodeName
}

// GetNetworkID returns the network ID, or "" for a nil receiver.
func (m *TableEvent) GetNetworkID() string {
	if m == nil {
		return ""
	}
	return m.NetworkID
}

// GetTableName returns the table name, or "" for a nil receiver.
func (m *TableEvent) GetTableName() string {
	if m == nil {
		return ""
	}
	return m.TableName
}

// GetKey returns the entry key, or "" for a nil receiver.
func (m *TableEvent) GetKey() string {
	if m == nil {
		return ""
	}
	return m.Key
}

// GetValue returns the entry value, or nil for a nil receiver.
func (m *TableEvent) GetValue() []byte {
	if m == nil {
		return nil
	}
	return m.Value
}

// GetResidualReapTime returns the residual reap time in seconds, or 0 for a
// nil receiver.
func (m *TableEvent) GetResidualReapTime() int32 {
	if m == nil {
		return 0
	}
	return m.ResidualReapTime
}
// BulkSync message payload definition.
type BulkSyncMessage struct {
	// Lamport time when this bulk sync was initiated.
	LTime github_com_hashicorp_serf_serf.LamportTime `protobuf:"varint,1,opt,name=l_time,json=lTime,proto3,customtype=github.com/hashicorp/serf/serf.LamportTime" json:"l_time"`
	// Indicates if this bulksync is a response to a bulk sync
	// request from a peer node.
	Unsolicited bool `protobuf:"varint,2,opt,name=unsolicited,proto3" json:"unsolicited,omitempty"`
	// Name of the node which is producing this bulk sync message.
	NodeName string `protobuf:"bytes,3,opt,name=node_name,json=nodeName,proto3" json:"node_name,omitempty"`
	// List of network names whose table entries are getting
	// bulksynced as part of the bulksync.
	Networks []string `protobuf:"bytes,4,rep,name=networks,proto3" json:"networks,omitempty"`
	// Bulksync payload
	Payload []byte `protobuf:"bytes,5,opt,name=payload,proto3" json:"payload,omitempty"`
}

// Reset restores the message to its zero value.
func (m *BulkSyncMessage) Reset() { *m = BulkSyncMessage{} }

// ProtoMessage marks BulkSyncMessage as a protobuf message.
func (*BulkSyncMessage) ProtoMessage() {}

// Descriptor returns the compressed file descriptor and this message's index.
func (*BulkSyncMessage) Descriptor() ([]byte, []int) {
	return fileDescriptor_51036566ca8c9782, []int{6}
}

// XXX_Unmarshal decodes b into m using the generated unmarshaler.
func (m *BulkSyncMessage) XXX_Unmarshal(b []byte) error {
	return m.Unmarshal(b)
}

// XXX_Marshal serializes m into b; deterministic selects the
// reflection-based marshaler over the generated fast path.
func (m *BulkSyncMessage) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
	if deterministic {
		return xxx_messageInfo_BulkSyncMessage.Marshal(b, m, deterministic)
	} else {
		b = b[:cap(b)]
		n, err := m.MarshalToSizedBuffer(b)
		if err != nil {
			return nil, err
		}
		return b[:n], nil
	}
}

// XXX_Merge merges src into m.
func (m *BulkSyncMessage) XXX_Merge(src proto.Message) {
	xxx_messageInfo_BulkSyncMessage.Merge(m, src)
}

// XXX_Size reports the encoded size of m in bytes.
func (m *BulkSyncMessage) XXX_Size() int {
	return m.Size()
}

// XXX_DiscardUnknown drops fields not recognized by the schema.
func (m *BulkSyncMessage) XXX_DiscardUnknown() {
	xxx_messageInfo_BulkSyncMessage.DiscardUnknown(m)
}

// xxx_messageInfo_BulkSyncMessage caches reflection info for BulkSyncMessage.
var xxx_messageInfo_BulkSyncMessage proto.InternalMessageInfo
// GetUnsolicited reports whether this bulk sync is unsolicited; false for a
// nil receiver.
func (m *BulkSyncMessage) GetUnsolicited() bool {
	if m == nil {
		return false
	}
	return m.Unsolicited
}

// GetNodeName returns the producing node's name, or "" for a nil receiver.
func (m *BulkSyncMessage) GetNodeName() string {
	if m == nil {
		return ""
	}
	return m.NodeName
}

// GetNetworks returns the synced network names, or nil for a nil receiver.
func (m *BulkSyncMessage) GetNetworks() []string {
	if m == nil {
		return nil
	}
	return m.Networks
}

// GetPayload returns the bulk sync payload, or nil for a nil receiver.
func (m *BulkSyncMessage) GetPayload() []byte {
	if m == nil {
		return nil
	}
	return m.Payload
}
// Compound message payload definition.
type CompoundMessage struct {
	// A list of simple messages.
	Messages []*CompoundMessage_SimpleMessage `protobuf:"bytes,1,rep,name=messages,proto3" json:"messages,omitempty"`
}

// Reset restores the message to its zero value.
func (m *CompoundMessage) Reset() { *m = CompoundMessage{} }

// ProtoMessage marks CompoundMessage as a protobuf message.
func (*CompoundMessage) ProtoMessage() {}

// Descriptor returns the compressed file descriptor and this message's index.
func (*CompoundMessage) Descriptor() ([]byte, []int) {
	return fileDescriptor_51036566ca8c9782, []int{7}
}

// XXX_Unmarshal decodes b into m using the generated unmarshaler.
func (m *CompoundMessage) XXX_Unmarshal(b []byte) error {
	return m.Unmarshal(b)
}

// XXX_Marshal serializes m into b; deterministic selects the
// reflection-based marshaler over the generated fast path.
func (m *CompoundMessage) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
	if deterministic {
		return xxx_messageInfo_CompoundMessage.Marshal(b, m, deterministic)
	} else {
		b = b[:cap(b)]
		n, err := m.MarshalToSizedBuffer(b)
		if err != nil {
			return nil, err
		}
		return b[:n], nil
	}
}

// XXX_Merge merges src into m.
func (m *CompoundMessage) XXX_Merge(src proto.Message) {
	xxx_messageInfo_CompoundMessage.Merge(m, src)
}

// XXX_Size reports the encoded size of m in bytes.
func (m *CompoundMessage) XXX_Size() int {
	return m.Size()
}

// XXX_DiscardUnknown drops fields not recognized by the schema.
func (m *CompoundMessage) XXX_DiscardUnknown() {
	xxx_messageInfo_CompoundMessage.DiscardUnknown(m)
}

// xxx_messageInfo_CompoundMessage caches reflection info for CompoundMessage.
var xxx_messageInfo_CompoundMessage proto.InternalMessageInfo
// GetMessages returns the contained simple messages, or nil for a nil
// receiver.
func (m *CompoundMessage) GetMessages() []*CompoundMessage_SimpleMessage {
	if m == nil {
		return nil
	}
	return m.Messages
}
// CompoundMessage_SimpleMessage is one element of a CompoundMessage.
type CompoundMessage_SimpleMessage struct {
	// Bytestring payload of a message constructed using
	// other message type definitions.
	Payload []byte `protobuf:"bytes,1,opt,name=Payload,proto3" json:"Payload,omitempty"`
}

// Reset restores the message to its zero value.
func (m *CompoundMessage_SimpleMessage) Reset() { *m = CompoundMessage_SimpleMessage{} }

// ProtoMessage marks CompoundMessage_SimpleMessage as a protobuf message.
func (*CompoundMessage_SimpleMessage) ProtoMessage() {}

// Descriptor returns the compressed file descriptor and this nested
// message's index path ({7, 0} = nested message 0 of message 7).
func (*CompoundMessage_SimpleMessage) Descriptor() ([]byte, []int) {
	return fileDescriptor_51036566ca8c9782, []int{7, 0}
}

// XXX_Unmarshal decodes b into m using the generated unmarshaler.
func (m *CompoundMessage_SimpleMessage) XXX_Unmarshal(b []byte) error {
	return m.Unmarshal(b)
}

// XXX_Marshal serializes m into b; deterministic selects the
// reflection-based marshaler over the generated fast path.
func (m *CompoundMessage_SimpleMessage) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) {
	if deterministic {
		return xxx_messageInfo_CompoundMessage_SimpleMessage.Marshal(b, m, deterministic)
	} else {
		b = b[:cap(b)]
		n, err := m.MarshalToSizedBuffer(b)
		if err != nil {
			return nil, err
		}
		return b[:n], nil
	}
}

// XXX_Merge merges src into m.
func (m *CompoundMessage_SimpleMessage) XXX_Merge(src proto.Message) {
	xxx_messageInfo_CompoundMessage_SimpleMessage.Merge(m, src)
}

// XXX_Size reports the encoded size of m in bytes.
func (m *CompoundMessage_SimpleMessage) XXX_Size() int {
	return m.Size()
}

// XXX_DiscardUnknown drops fields not recognized by the schema.
func (m *CompoundMessage_SimpleMessage) XXX_DiscardUnknown() {
	xxx_messageInfo_CompoundMessage_SimpleMessage.DiscardUnknown(m)
}

// xxx_messageInfo_CompoundMessage_SimpleMessage caches reflection info.
var xxx_messageInfo_CompoundMessage_SimpleMessage proto.InternalMessageInfo
// GetPayload returns the serialized inner message, or nil for a nil
// receiver.
func (m *CompoundMessage_SimpleMessage) GetPayload() []byte {
	if m == nil {
		return nil
	}
	return m.Payload
}
// init registers the networkdb enums and message types with the proto type
// registry under their fully-qualified names.
func init() {
	proto.RegisterEnum("networkdb.MessageType", MessageType_name, MessageType_value)
	proto.RegisterEnum("networkdb.NodeEvent_Type", NodeEvent_Type_name, NodeEvent_Type_value)
	proto.RegisterEnum("networkdb.NetworkEvent_Type", NetworkEvent_Type_name, NetworkEvent_Type_value)
	proto.RegisterEnum("networkdb.TableEvent_Type", TableEvent_Type_name, TableEvent_Type_value)
	proto.RegisterType((*GossipMessage)(nil), "networkdb.GossipMessage")
	proto.RegisterType((*NodeEvent)(nil), "networkdb.NodeEvent")
	proto.RegisterType((*NetworkEvent)(nil), "networkdb.NetworkEvent")
	proto.RegisterType((*NetworkEntry)(nil), "networkdb.NetworkEntry")
	proto.RegisterType((*NetworkPushPull)(nil), "networkdb.NetworkPushPull")
	proto.RegisterType((*TableEvent)(nil), "networkdb.TableEvent")
	proto.RegisterType((*BulkSyncMessage)(nil), "networkdb.BulkSyncMessage")
	proto.RegisterType((*CompoundMessage)(nil), "networkdb.CompoundMessage")
	proto.RegisterType((*CompoundMessage_SimpleMessage)(nil), "networkdb.CompoundMessage.SimpleMessage")
}

// init registers the compressed file descriptor under the proto file name.
func init() { proto.RegisterFile("networkdb.proto", fileDescriptor_51036566ca8c9782) }
// fileDescriptor_51036566ca8c9782 holds the gzip-compressed
// FileDescriptorProto for networkdb.proto. Generated data — do not edit.
var fileDescriptor_51036566ca8c9782 = []byte{
	// 975 bytes of a gzipped FileDescriptorProto
	0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xcc, 0x56, 0xcd, 0x6e, 0xdb, 0x46,
	0x10, 0xd6, 0xea, 0xcf, 0xd2, 0x58, 0xae, 0x59, 0xc6, 0x89, 0x19, 0xa6, 0xa5, 0x58, 0xd6, 0x31,
	0x14, 0xa3, 0x91, 0x0b, 0xe7, 0x09, 0x2c, 0x89, 0x68, 0x95, 0x28, 0x94, 0x40, 0x4b, 0x2e, 0x7a,
	0x12, 0x28, 0x73, 0x23, 0x13, 0xa6, 0x48, 0x82, 0xa4, 0x54, 0xe8, 0xd4, 0xa2, 0xa7, 0x40, 0x87,
	0xa2, 0x2f, 0xa0, 0x53, 0x7a, 0xee, 0x03, 0x14, 0x3d, 0xf6, 0x90, 0x43, 0x0f, 0xe9, 0x2d, 0xe8,
	0x41, 0xa8, 0xe5, 0x17, 0xe8, 0x23, 0x14, 0x5c, 0x92, 0xd2, 0x4a, 0x36, 0x02, 0x14, 0x35, 0xd0,
	0x5c, 0xa4, 0x9d, 0x99, 0x8f, 0xb3, 0x33, 0x1f, 0xbf, 0xd9, 0x25, 0x6c, 0x5b, 0xd8, 0xff, 0xc6,
	0x76, 0x2f, 0xf4, 0x5e, 0xd9, 0x71, 0x6d, 0xdf, 0x66, 0xf3, 0x0b, 0x07, 0xff, 0xb8, 0x6f, 0xf8,
	0xe7, 0xc3, 0x5e, 0xf9, 0xcc, 0x1e, 0x1c, 0xf6, 0xed, 0xbe, 0x7d, 0x48, 0x10, 0xbd, 0xe1, 0x0b,
	0x62, 0x11, 0x83, 0xac, 0xc2, 0x27, 0xa5, 0x26, 0x6c, 0x7d, 0x61, 0x7b, 0x9e, 0xe1, 0x3c, 0xc7,
	0x9e, 0xa7, 0xf5, 0x31, 0x7b, 0x00, 0x69, 0x7f, 0xec, 0x60, 0x0e, 0x89, 0xa8, 0xf4, 0xc1, 0xd1,
	0xbd, 0xf2, 0x72, 0xab, 0x08, 0xd1, 0x1e, 0x3b, 0x58, 0x25, 0x18, 0x96, 0x85, 0xb4, 0xae, 0xf9,
	0x1a, 0x97, 0x14, 0x51, 0xa9, 0xa0, 0x92, 0xb5, 0xf4, 0x2a, 0x09, 0x79, 0xc5, 0xd6, 0xb1, 0x3c,
	0xc2, 0x96, 0xcf, 0x3e, 0x5e, 0xc9, 0x76, 0x9f, 0xca, 0xb6, 0xc0, 0x94, 0xa9, 0x84, 0x75, 0xc8,
	0x9a, 0x5d, 0xdf, 0x18, 0x60, 0x92, 0x32, 0x5d, 0x39, 0x7a, 0x3d, 0x2b, 0x26, 0xfe, 0x9c, 0x15,
	0x0f, 0xa8, 0xa6, 0xce, 0x35, 0xef, 0xdc, 0x38, 0xb3, 0x5d, 0xe7, 0xd0, 0xc3, 0xee, 0x0b, 0xf2,
	0x53, 0x6e, 0x68, 0x03, 0xc7, 0x76, 0xfd, 0xb6, 0x31, 0xc0, 0x6a, 0xc6, 0x0c, 0xfe, 0xd8, 0x07,
	0x90, 0xb7, 0x6c, 0x1d, 0x77, 0x2d, 0x6d, 0x80, 0xb9, 0x94, 0x88, 0x4a, 0x79, 0x35, 0x17, 0x38,
	0x14, 0x6d, 0x80, 0xa5, 0x6f, 0x21, 0x1d, 0xec, 0xca, 0x3e, 0x84, 0x8d, 0xba, 0x72, 0x7a, 0xdc,
	0xa8, 0xd7, 0x98, 0x04, 0xcf, 0x4d, 0xa6, 0xe2, 0xce, 0xa2, 0xac, 0x20, 0x5e, 0xb7, 0x46, 0x9a,
	0x69, 0xe8, 0x6c, 0x11, 0xd2, 0x4f, 0x9b, 0x75, 0x85, 0x41, 0xfc, 0xdd, 0xc9, 0x54, 0xfc, 0x70,
	0x05, 0xf3, 0xd4, 0x36, 0x2c, 0xf6, 0x13, 0xc8, 0x34, 0xe4, 0xe3, 0x53, 0x99, 0x49, 0xf2, 0xf7,
	0x26, 0x53, 0x91, 0x5d, 0x41, 0x34, 0xb0, 0x36, 0xc2, 0x7c, 0xe1, 0xe5, 0x2b, 0x21, 0xf1, 0xcb,
	0x4f, 0x02, 0xd9, 0x58, 0xba, 0x4c, 0x42, 0x41, 0x09, 0xb9, 0x08, 0x89, 0xfa, 0x7c, 0x85, 0xa8,
	0x8f, 0x68, 0xa2, 0x28, 0xd8, 0xff, 0xc0, 0x15, 0xfb, 0x19, 0x40, 0x54, 0x4c, 0xd7, 0xd0, 0xb9,
	0x74, 0x10, 0xad, 0x6c, 0xcd, 0x67, 0xc5, 0x7c, 0x54, 0x58, 0xbd, 0xa6, 0xc6, 0xf2, 0xab, 0xeb,
	0xd2, 0x4b, 0x14, 0x51, 0x5b, 0xa2, 0xa9, 0x7d, 0x30, 0x99, 0x8a, 0xbb, 0x74, 0x23, 0x34, 0xbb,
	0xd2, 0x82, 0xdd, 0xf0, 0x0d, 0xac, 0xc1, 0x08, 0xc1, 0x7b, 0x4b, 0x82, 0xef, 0x4f, 0xa6, 0xe2,
	0xdd, 0x75, 0xd0, 0x4d, 0x1c, 0xff, 0x8e, 0x96, 0x1c, 0x5b, 0xbe, 0x3b, 0x5e, 0xeb, 0x04, 0xbd,
	0xbb, 0x93, 0xdb, 0xe4, 0xf7, 0xd1, 0x35, 0x7e, 0x2b, 0x85, 0xf9, 0xac, 0x98, 0x53, 0x22, 0x8e,
	0x29, 0xb6, 0x39, 0xd8, 0x30, 0xb1, 0x36, 0x32, 0xac, 0x3e, 0xa1, 0x3a, 0xa7, 0xc6, 0xa6, 0xf4,
	0x2b, 0x82, 0xed, 0xa8, 0xd0, 0xd6, 0xd0, 0x3b, 0x6f, 0x0d, 0x4d, 0x93, 0xaa, 0x11, 0xfd, 0xd7,
	0x1a, 0x9f, 0x40, 0x2e, 0xea, 0xdd, 0xe3, 0x92, 0x62, 0xaa, 0xb4, 0x79, 0xb4, 0x7b, 0x83, 0x08,
	0x03, 0x1e, 0xd5, 0x05, 0xf0, 0x5f, 0x34, 0x26, 0xfd, 0x90, 0x06, 0x68, 0x6b, 0x3d, 0x33, 0x3a,
	0x18, 0xca, 0x2b, 0x7a, 0xe7, 0xa9, 0xad, 0x96, 0xa0, 0xf7, 0x5e, 0xed, 0xec, 0xc7, 0x00, 0x7e,
	0x50, 0x6e, 0x98, 0x2b, 0x43, 0x72, 0xe5, 0x89, 0x87, 0x24, 0x63, 0x20, 0x75, 0x81, 0xc7, 0x5c,
	0x96, 0xf8, 0x83, 0x25, 0xbb, 0x03, 0x99, 0x91, 0x66, 0x0e, 0x31, 0xb7, 0x41, 0x8e, 0xcc, 0xd0,
	0x60, 0x2b, 0xc0, 0xba, 0xd8, 0x33, 0xf4, 0xa1, 0x66, 0x76, 0x5d, 0xac, 0x39, 0x61, 0xa3, 0x39,
	0x11, 0x95, 0x32, 0x95, 0x9d, 0xf9, 0xac, 0xc8, 0xa8, 0x51, 0x54, 0xc5, 0x9a, 0x43, 0x5a, 0x61,
	0xdc, 0x35, 0x8f, 0xf4, 0x73, 0x3c, 0x78, 0xfb, 0xf4, 0xe0, 0x91, 0x61, 0x59, 0x32, 0x4a, 0x8f,
	0xdd, 0x1e, 0x64, 0xab, 0xaa, 0x7c, 0xdc, 0x96, 0xe3, 0xc1, 0x5b, 0x85, 0x55, 0x5d, 0xac, 0xf9,
	0x38, 0x40, 0x75, 0x5a, 0xb5, 0x00, 0x95, 0xbc, 0x09, 0xd5, 0x71, 0xf4, 0x08, 0x55, 0x93, 0x1b,
	0x72, 0x5b, 0x66, 0x52, 0x37, 0xa1, 0x6a, 0xd8, 0xc4, 0xfe, 0xfa, 0x78, 0xfe, 0x81, 0x60, 0xbb,
	0x32, 0x34, 0x2f, 0x4e, 0xc6, 0xd6, 0x59, 0x7c, 0xf9, 0xdc, 0xa2, 0x9e, 0x45, 0xd8, 0x1c, 0x5a,
	0x9e, 0x6d, 0x1a, 0x67, 0x86, 0x8f, 0x75, 0xa2, 0x9a, 0x9c, 0x4a, 0xbb, 0xde, 0xad, 0x03, 0x9e,
	0x1a, 0x87, 0xb4, 0x98, 0x22, 0xb1, 0x58, 0xf5, 0x1c, 0x6c, 0x38, 0xda, 0xd8, 0xb4, 0x35, 0x9d,
	0xbc, 0xf2, 0x82, 0x1a, 0x9b, 0xd2, 0xf7, 0x08, 0xb6, 0xab, 0xf6, 0xc0, 0xb1, 0x87, 0x96, 0x1e,
	0xf7, 0x54, 0x83, 0xdc, 0x20, 0x5c, 0x7a, 0x1c, 0x22, 0x83, 0x55, 0xa2, 0xd4, 0xbe, 0x86, 0x2e,
	0x9f, 0x18, 0x03, 0xc7, 0xc4, 0x91, 0xa5, 0x2e, 0x9e, 0xe4, 0x1f, 0xc1, 0xd6, 0x4a, 0x28, 0x28,
	0xa2, 0x15, 0x15, 0x81, 0xc2, 0x22, 0x22, 0xf3, 0xe0, 0xb7, 0x24, 0x6c, 0x52, 0x77, 0x35, 0xfb,
	0x29, 0x2d, 0x08, 0x72, 0x3d, 0x51, 0xd1, 0x58, 0x0d, 0x65, 0xd8, 0x52, 0xe4, 0xf6, 0x57, 0x4d,
	0xf5, 0x59, 0x57, 0x3e, 0x95, 0x95, 0x36, 0x83, 0xc2, 0x43, 0x9b, 0x82, 0xae, 0xdc, 0x57, 0x07,
	0xb0, 0xd9, 0x3e, 0xae, 0x34, 0xe4, 0x08, 0x1d, 0x1d, 0xcb, 0x14, 0x9a, 0x9a, 0xf5, 0x7d, 0xc8,
	0xb7, 0x3a, 0x27, 0x5f, 0x76, 0x5b, 0x9d, 0x46, 0x83, 0x49, 0xf1, 0xbb, 0x93, 0xa9, 0x78, 0x87,
	0x42, 0x2e, 0x4e, 0xb3, 0x7d, 0xc8, 0x57, 0x3a, 0x8d, 0x67, 0xdd, 0x93, 0xaf, 0x95, 0x2a, 0x93,
	0xbe, 0x86, 0x8b, 0xc5, 0xc2, 0x3e, 0x84, 0x5c, 0xb5, 0xf9, 0xbc, 0xd5, 0xec, 0x28, 0x35, 0x26,
	0x73, 0x0d, 0x16, 0x33, 0xca, 0x96, 0x00, 0x94, 0x66, 0x2d, 0xae, 0x30, 0x1b, 0x0a, 0x93, 0xee,
	0x27, 0xbe, 0xa4, 0xf9, 0x3b, 0x91, 0x30, 0x69, 0xda, 0x2a, 0x7b, 0x6f, 0x2f, 0x85, 0xc4, 0xdf,
	0x97, 0x02, 0xfa, 0x6e, 0x2e, 0xa0, 0xd7, 0x73, 0x01, 0xbd, 0x99, 0x0b, 0xe8, 0xaf, 0xb9, 0x80,
	0x7e, 0xbc, 0x12, 0x12, 0x6f, 0xae, 0x84, 0xc4, 0xdb, 0x2b, 0x21, 0xd1, 0xcb, 0x92, 0xcf, 0xa8,
	0x27, 0xff, 0x04, 0x00, 0x00, 0xff, 0xff, 0x82, 0xf0, 0x4d, 0x63, 0x93, 0x09, 0x00, 0x00,
}
// GoString returns a Go-syntax representation of the message, or "nil".
func (this *GossipMessage) GoString() string {
	if this == nil {
		return "nil"
	}
	s := make([]string, 0, 6)
	s = append(s, "&networkdb.GossipMessage{")
	s = append(s, "Type: "+fmt.Sprintf("%#v", this.Type)+",\n")
	s = append(s, "Data: "+fmt.Sprintf("%#v", this.Data)+",\n")
	s = append(s, "}")
	return strings.Join(s, "")
}

// GoString returns a Go-syntax representation of the message, or "nil".
func (this *NodeEvent) GoString() string {
	if this == nil {
		return "nil"
	}
	s := make([]string, 0, 7)
	s = append(s, "&networkdb.NodeEvent{")
	s = append(s, "Type: "+fmt.Sprintf("%#v", this.Type)+",\n")
	s = append(s, "LTime: "+fmt.Sprintf("%#v", this.LTime)+",\n")
	s = append(s, "NodeName: "+fmt.Sprintf("%#v", this.NodeName)+",\n")
	s = append(s, "}")
	return strings.Join(s, "")
}

// GoString returns a Go-syntax representation of the message, or "nil".
func (this *NetworkEvent) GoString() string {
	if this == nil {
		return "nil"
	}
	s := make([]string, 0, 8)
	s = append(s, "&networkdb.NetworkEvent{")
	s = append(s, "Type: "+fmt.Sprintf("%#v", this.Type)+",\n")
	s = append(s, "LTime: "+fmt.Sprintf("%#v", this.LTime)+",\n")
	s = append(s, "NodeName: "+fmt.Sprintf("%#v", this.NodeName)+",\n")
	s = append(s, "NetworkID: "+fmt.Sprintf("%#v", this.NetworkID)+",\n")
	s = append(s, "}")
	return strings.Join(s, "")
}

// GoString returns a Go-syntax representation of the message, or "nil".
func (this *NetworkEntry) GoString() string {
	if this == nil {
		return "nil"
	}
	s := make([]string, 0, 8)
	s = append(s, "&networkdb.NetworkEntry{")
	s = append(s, "NetworkID: "+fmt.Sprintf("%#v", this.NetworkID)+",\n")
	s = append(s, "LTime: "+fmt.Sprintf("%#v", this.LTime)+",\n")
	s = append(s, "NodeName: "+fmt.Sprintf("%#v", this.NodeName)+",\n")
	s = append(s, "Leaving: "+fmt.Sprintf("%#v", this.Leaving)+",\n")
	s = append(s, "}")
	return strings.Join(s, "")
}

// GoString returns a Go-syntax representation of the message, or "nil".
// The Networks slice is only rendered when non-nil.
func (this *NetworkPushPull) GoString() string {
	if this == nil {
		return "nil"
	}
	s := make([]string, 0, 7)
	s = append(s, "&networkdb.NetworkPushPull{")
	s = append(s, "LTime: "+fmt.Sprintf("%#v", this.LTime)+",\n")
	if this.Networks != nil {
		s = append(s, "Networks: "+fmt.Sprintf("%#v", this.Networks)+",\n")
	}
	s = append(s, "NodeName: "+fmt.Sprintf("%#v", this.NodeName)+",\n")
	s = append(s, "}")
	return strings.Join(s, "")
}

// GoString returns a Go-syntax representation of the message, or "nil".
func (this *TableEvent) GoString() string {
	if this == nil {
		return "nil"
	}
	s := make([]string, 0, 12)
	s = append(s, "&networkdb.TableEvent{")
	s = append(s, "Type: "+fmt.Sprintf("%#v", this.Type)+",\n")
	s = append(s, "LTime: "+fmt.Sprintf("%#v", this.LTime)+",\n")
	s = append(s, "NodeName: "+fmt.Sprintf("%#v", this.NodeName)+",\n")
	s = append(s, "NetworkID: "+fmt.Sprintf("%#v", this.NetworkID)+",\n")
	s = append(s, "TableName: "+fmt.Sprintf("%#v", this.TableName)+",\n")
	s = append(s, "Key: "+fmt.Sprintf("%#v", this.Key)+",\n")
	s = append(s, "Value: "+fmt.Sprintf("%#v", this.Value)+",\n")
	s = append(s, "ResidualReapTime: "+fmt.Sprintf("%#v", this.ResidualReapTime)+",\n")
	s = append(s, "}")
	return strings.Join(s, "")
}

// GoString returns a Go-syntax representation of the message, or "nil".
func (this *BulkSyncMessage) GoString() string {
	if this == nil {
		return "nil"
	}
	s := make([]string, 0, 9)
	s = append(s, "&networkdb.BulkSyncMessage{")
	s = append(s, "LTime: "+fmt.Sprintf("%#v", this.LTime)+",\n")
	s = append(s, "Unsolicited: "+fmt.Sprintf("%#v", this.Unsolicited)+",\n")
	s = append(s, "NodeName: "+fmt.Sprintf("%#v", this.NodeName)+",\n")
	s = append(s, "Networks: "+fmt.Sprintf("%#v", this.Networks)+",\n")
	s = append(s, "Payload: "+fmt.Sprintf("%#v", this.Payload)+",\n")
	s = append(s, "}")
	return strings.Join(s, "")
}

// GoString returns a Go-syntax representation of the message, or "nil".
// The Messages slice is only rendered when non-nil.
func (this *CompoundMessage) GoString() string {
	if this == nil {
		return "nil"
	}
	s := make([]string, 0, 5)
	s = append(s, "&networkdb.CompoundMessage{")
	if this.Messages != nil {
		s = append(s, "Messages: "+fmt.Sprintf("%#v", this.Messages)+",\n")
	}
	s = append(s, "}")
	return strings.Join(s, "")
}

// GoString returns a Go-syntax representation of the message, or "nil".
func (this *CompoundMessage_SimpleMessage) GoString() string {
	if this == nil {
		return "nil"
	}
	s := make([]string, 0, 5)
	s = append(s, "&networkdb.CompoundMessage_SimpleMessage{")
	s = append(s, "Payload: "+fmt.Sprintf("%#v", this.Payload)+",\n")
	s = append(s, "}")
	return strings.Join(s, "")
}

// valueToGoStringNetworkdb renders a (possibly nil) pointer value v as a
// Go-syntax expression that reconstructs it, using reflection to dereference.
func valueToGoStringNetworkdb(v interface{}, typ string) string {
	rv := reflect.ValueOf(v)
	if rv.IsNil() {
		return "nil"
	}
	pv := reflect.Indirect(rv).Interface()
	return fmt.Sprintf("func(v %v) *%v { return &v } ( %#v )", typ, typ, pv)
}
// Marshal encodes m into a freshly allocated buffer of exactly m.Size() bytes.
func (m *GossipMessage) Marshal() (dAtA []byte, err error) {
	size := m.Size()
	dAtA = make([]byte, size)
	n, err := m.MarshalToSizedBuffer(dAtA[:size])
	if err != nil {
		return nil, err
	}
	return dAtA[:n], nil
}

// MarshalTo encodes m into dAtA, which must be at least m.Size() bytes.
func (m *GossipMessage) MarshalTo(dAtA []byte) (int, error) {
	size := m.Size()
	return m.MarshalToSizedBuffer(dAtA[:size])
}

// MarshalToSizedBuffer encodes m into the tail of dAtA, writing fields in
// reverse field-number order, and returns the number of bytes written.
func (m *GossipMessage) MarshalToSizedBuffer(dAtA []byte) (int, error) {
	i := len(dAtA)
	_ = i
	var l int
	_ = l
	if len(m.Data) > 0 {
		i -= len(m.Data)
		copy(dAtA[i:], m.Data)
		i = encodeVarintNetworkdb(dAtA, i, uint64(len(m.Data)))
		i--
		dAtA[i] = 0x12 // field 2, wire type 2 (length-delimited)
	}
	if m.Type != 0 {
		i = encodeVarintNetworkdb(dAtA, i, uint64(m.Type))
		i--
		dAtA[i] = 0x8 // field 1, wire type 0 (varint)
	}
	return len(dAtA) - i, nil
}
// Marshal encodes m into a freshly allocated buffer of exactly m.Size() bytes.
func (m *NodeEvent) Marshal() (dAtA []byte, err error) {
	size := m.Size()
	dAtA = make([]byte, size)
	n, err := m.MarshalToSizedBuffer(dAtA[:size])
	if err != nil {
		return nil, err
	}
	return dAtA[:n], nil
}

// MarshalTo encodes m into dAtA, which must be at least m.Size() bytes.
func (m *NodeEvent) MarshalTo(dAtA []byte) (int, error) {
	size := m.Size()
	return m.MarshalToSizedBuffer(dAtA[:size])
}

// MarshalToSizedBuffer encodes m into the tail of dAtA, writing fields in
// reverse field-number order, and returns the number of bytes written.
func (m *NodeEvent) MarshalToSizedBuffer(dAtA []byte) (int, error) {
	i := len(dAtA)
	_ = i
	var l int
	_ = l
	if len(m.NodeName) > 0 {
		i -= len(m.NodeName)
		copy(dAtA[i:], m.NodeName)
		i = encodeVarintNetworkdb(dAtA, i, uint64(len(m.NodeName)))
		i--
		dAtA[i] = 0x1a // field 3, wire type 2
	}
	if m.LTime != 0 {
		i = encodeVarintNetworkdb(dAtA, i, uint64(m.LTime))
		i--
		dAtA[i] = 0x10 // field 2, wire type 0
	}
	if m.Type != 0 {
		i = encodeVarintNetworkdb(dAtA, i, uint64(m.Type))
		i--
		dAtA[i] = 0x8 // field 1, wire type 0
	}
	return len(dAtA) - i, nil
}
// Marshal encodes m into a freshly allocated buffer of exactly m.Size() bytes.
func (m *NetworkEvent) Marshal() (dAtA []byte, err error) {
	size := m.Size()
	dAtA = make([]byte, size)
	n, err := m.MarshalToSizedBuffer(dAtA[:size])
	if err != nil {
		return nil, err
	}
	return dAtA[:n], nil
}

// MarshalTo encodes m into dAtA, which must be at least m.Size() bytes.
func (m *NetworkEvent) MarshalTo(dAtA []byte) (int, error) {
	size := m.Size()
	return m.MarshalToSizedBuffer(dAtA[:size])
}

// MarshalToSizedBuffer encodes m into the tail of dAtA, writing fields in
// reverse field-number order, and returns the number of bytes written.
func (m *NetworkEvent) MarshalToSizedBuffer(dAtA []byte) (int, error) {
	i := len(dAtA)
	_ = i
	var l int
	_ = l
	if len(m.NetworkID) > 0 {
		i -= len(m.NetworkID)
		copy(dAtA[i:], m.NetworkID)
		i = encodeVarintNetworkdb(dAtA, i, uint64(len(m.NetworkID)))
		i--
		dAtA[i] = 0x22 // field 4, wire type 2
	}
	if len(m.NodeName) > 0 {
		i -= len(m.NodeName)
		copy(dAtA[i:], m.NodeName)
		i = encodeVarintNetworkdb(dAtA, i, uint64(len(m.NodeName)))
		i--
		dAtA[i] = 0x1a // field 3, wire type 2
	}
	if m.LTime != 0 {
		i = encodeVarintNetworkdb(dAtA, i, uint64(m.LTime))
		i--
		dAtA[i] = 0x10 // field 2, wire type 0
	}
	if m.Type != 0 {
		i = encodeVarintNetworkdb(dAtA, i, uint64(m.Type))
		i--
		dAtA[i] = 0x8 // field 1, wire type 0
	}
	return len(dAtA) - i, nil
}
// Marshal encodes m into a freshly allocated buffer of exactly m.Size() bytes.
func (m *NetworkEntry) Marshal() (dAtA []byte, err error) {
	size := m.Size()
	dAtA = make([]byte, size)
	n, err := m.MarshalToSizedBuffer(dAtA[:size])
	if err != nil {
		return nil, err
	}
	return dAtA[:n], nil
}

// MarshalTo encodes m into dAtA, which must be at least m.Size() bytes.
func (m *NetworkEntry) MarshalTo(dAtA []byte) (int, error) {
	size := m.Size()
	return m.MarshalToSizedBuffer(dAtA[:size])
}

// MarshalToSizedBuffer encodes m into the tail of dAtA, writing fields in
// reverse field-number order, and returns the number of bytes written.
func (m *NetworkEntry) MarshalToSizedBuffer(dAtA []byte) (int, error) {
	i := len(dAtA)
	_ = i
	var l int
	_ = l
	if m.Leaving {
		i--
		if m.Leaving {
			dAtA[i] = 1
		} else {
			dAtA[i] = 0
		}
		i--
		dAtA[i] = 0x20 // field 4, wire type 0 (bool as varint)
	}
	if len(m.NodeName) > 0 {
		i -= len(m.NodeName)
		copy(dAtA[i:], m.NodeName)
		i = encodeVarintNetworkdb(dAtA, i, uint64(len(m.NodeName)))
		i--
		dAtA[i] = 0x1a // field 3, wire type 2
	}
	if m.LTime != 0 {
		i = encodeVarintNetworkdb(dAtA, i, uint64(m.LTime))
		i--
		dAtA[i] = 0x10 // field 2, wire type 0
	}
	if len(m.NetworkID) > 0 {
		i -= len(m.NetworkID)
		copy(dAtA[i:], m.NetworkID)
		i = encodeVarintNetworkdb(dAtA, i, uint64(len(m.NetworkID)))
		i--
		dAtA[i] = 0xa // field 1, wire type 2
	}
	return len(dAtA) - i, nil
}
// Marshal encodes m into a freshly allocated buffer of exactly m.Size() bytes.
func (m *NetworkPushPull) Marshal() (dAtA []byte, err error) {
	size := m.Size()
	dAtA = make([]byte, size)
	n, err := m.MarshalToSizedBuffer(dAtA[:size])
	if err != nil {
		return nil, err
	}
	return dAtA[:n], nil
}

// MarshalTo encodes m into dAtA, which must be at least m.Size() bytes.
func (m *NetworkPushPull) MarshalTo(dAtA []byte) (int, error) {
	size := m.Size()
	return m.MarshalToSizedBuffer(dAtA[:size])
}

// MarshalToSizedBuffer encodes m into the tail of dAtA, writing fields in
// reverse field-number order (repeated entries in reverse element order so
// they decode in order), and returns the number of bytes written.
func (m *NetworkPushPull) MarshalToSizedBuffer(dAtA []byte) (int, error) {
	i := len(dAtA)
	_ = i
	var l int
	_ = l
	if len(m.NodeName) > 0 {
		i -= len(m.NodeName)
		copy(dAtA[i:], m.NodeName)
		i = encodeVarintNetworkdb(dAtA, i, uint64(len(m.NodeName)))
		i--
		dAtA[i] = 0x1a // field 3, wire type 2
	}
	if len(m.Networks) > 0 {
		for iNdEx := len(m.Networks) - 1; iNdEx >= 0; iNdEx-- {
			{
				size, err := m.Networks[iNdEx].MarshalToSizedBuffer(dAtA[:i])
				if err != nil {
					return 0, err
				}
				i -= size
				i = encodeVarintNetworkdb(dAtA, i, uint64(size))
			}
			i--
			dAtA[i] = 0x12 // field 2, wire type 2
		}
	}
	if m.LTime != 0 {
		i = encodeVarintNetworkdb(dAtA, i, uint64(m.LTime))
		i--
		dAtA[i] = 0x8 // field 1, wire type 0
	}
	return len(dAtA) - i, nil
}
// Marshal encodes m into a freshly allocated buffer of exactly m.Size() bytes.
func (m *TableEvent) Marshal() (dAtA []byte, err error) {
	size := m.Size()
	dAtA = make([]byte, size)
	n, err := m.MarshalToSizedBuffer(dAtA[:size])
	if err != nil {
		return nil, err
	}
	return dAtA[:n], nil
}

// MarshalTo encodes m into dAtA, which must be at least m.Size() bytes.
func (m *TableEvent) MarshalTo(dAtA []byte) (int, error) {
	size := m.Size()
	return m.MarshalToSizedBuffer(dAtA[:size])
}

// MarshalToSizedBuffer encodes m into the tail of dAtA, writing fields in
// reverse field-number order, and returns the number of bytes written.
func (m *TableEvent) MarshalToSizedBuffer(dAtA []byte) (int, error) {
	i := len(dAtA)
	_ = i
	var l int
	_ = l
	if m.ResidualReapTime != 0 {
		i = encodeVarintNetworkdb(dAtA, i, uint64(m.ResidualReapTime))
		i--
		dAtA[i] = 0x40 // field 8, wire type 0
	}
	if len(m.Value) > 0 {
		i -= len(m.Value)
		copy(dAtA[i:], m.Value)
		i = encodeVarintNetworkdb(dAtA, i, uint64(len(m.Value)))
		i--
		dAtA[i] = 0x3a // field 7, wire type 2
	}
	if len(m.Key) > 0 {
		i -= len(m.Key)
		copy(dAtA[i:], m.Key)
		i = encodeVarintNetworkdb(dAtA, i, uint64(len(m.Key)))
		i--
		dAtA[i] = 0x32 // field 6, wire type 2
	}
	if len(m.TableName) > 0 {
		i -= len(m.TableName)
		copy(dAtA[i:], m.TableName)
		i = encodeVarintNetworkdb(dAtA, i, uint64(len(m.TableName)))
		i--
		dAtA[i] = 0x2a // field 5, wire type 2
	}
	if len(m.NetworkID) > 0 {
		i -= len(m.NetworkID)
		copy(dAtA[i:], m.NetworkID)
		i = encodeVarintNetworkdb(dAtA, i, uint64(len(m.NetworkID)))
		i--
		dAtA[i] = 0x22 // field 4, wire type 2
	}
	if len(m.NodeName) > 0 {
		i -= len(m.NodeName)
		copy(dAtA[i:], m.NodeName)
		i = encodeVarintNetworkdb(dAtA, i, uint64(len(m.NodeName)))
		i--
		dAtA[i] = 0x1a // field 3, wire type 2
	}
	if m.LTime != 0 {
		i = encodeVarintNetworkdb(dAtA, i, uint64(m.LTime))
		i--
		dAtA[i] = 0x10 // field 2, wire type 0
	}
	if m.Type != 0 {
		i = encodeVarintNetworkdb(dAtA, i, uint64(m.Type))
		i--
		dAtA[i] = 0x8 // field 1, wire type 0
	}
	return len(dAtA) - i, nil
}
// Marshal encodes m into a freshly allocated buffer of exactly m.Size() bytes.
func (m *BulkSyncMessage) Marshal() (dAtA []byte, err error) {
	size := m.Size()
	dAtA = make([]byte, size)
	n, err := m.MarshalToSizedBuffer(dAtA[:size])
	if err != nil {
		return nil, err
	}
	return dAtA[:n], nil
}

// MarshalTo encodes m into dAtA, which must be at least m.Size() bytes.
func (m *BulkSyncMessage) MarshalTo(dAtA []byte) (int, error) {
	size := m.Size()
	return m.MarshalToSizedBuffer(dAtA[:size])
}

// MarshalToSizedBuffer encodes m into the tail of dAtA, writing fields in
// reverse field-number order (repeated strings in reverse element order so
// they decode in order), and returns the number of bytes written.
func (m *BulkSyncMessage) MarshalToSizedBuffer(dAtA []byte) (int, error) {
	i := len(dAtA)
	_ = i
	var l int
	_ = l
	if len(m.Payload) > 0 {
		i -= len(m.Payload)
		copy(dAtA[i:], m.Payload)
		i = encodeVarintNetworkdb(dAtA, i, uint64(len(m.Payload)))
		i--
		dAtA[i] = 0x2a // field 5, wire type 2
	}
	if len(m.Networks) > 0 {
		for iNdEx := len(m.Networks) - 1; iNdEx >= 0; iNdEx-- {
			i -= len(m.Networks[iNdEx])
			copy(dAtA[i:], m.Networks[iNdEx])
			i = encodeVarintNetworkdb(dAtA, i, uint64(len(m.Networks[iNdEx])))
			i--
			dAtA[i] = 0x22 // field 4, wire type 2
		}
	}
	if len(m.NodeName) > 0 {
		i -= len(m.NodeName)
		copy(dAtA[i:], m.NodeName)
		i = encodeVarintNetworkdb(dAtA, i, uint64(len(m.NodeName)))
		i--
		dAtA[i] = 0x1a // field 3, wire type 2
	}
	if m.Unsolicited {
		i--
		if m.Unsolicited {
			dAtA[i] = 1
		} else {
			dAtA[i] = 0
		}
		i--
		dAtA[i] = 0x10 // field 2, wire type 0 (bool as varint)
	}
	if m.LTime != 0 {
		i = encodeVarintNetworkdb(dAtA, i, uint64(m.LTime))
		i--
		dAtA[i] = 0x8 // field 1, wire type 0
	}
	return len(dAtA) - i, nil
}
// Marshal encodes m into a freshly allocated buffer in protobuf wire
// format and returns the encoded bytes.
func (m *CompoundMessage) Marshal() (dAtA []byte, err error) {
	size := m.Size()
	dAtA = make([]byte, size)
	n, err := m.MarshalToSizedBuffer(dAtA[:size])
	if err != nil {
		return nil, err
	}
	return dAtA[:n], nil
}
// MarshalTo encodes m into dAtA, which must be at least m.Size() bytes,
// and returns the number of bytes written.
func (m *CompoundMessage) MarshalTo(dAtA []byte) (int, error) {
	size := m.Size()
	return m.MarshalToSizedBuffer(dAtA[:size])
}
// MarshalToSizedBuffer encodes m into the tail of dAtA, writing the
// repeated Messages field back-to-front, each as a length-prefixed
// embedded message. Returns the number of bytes written.
func (m *CompoundMessage) MarshalToSizedBuffer(dAtA []byte) (int, error) {
	i := len(dAtA)
	_ = i
	var l int
	_ = l
	if len(m.Messages) > 0 {
		for iNdEx := len(m.Messages) - 1; iNdEx >= 0; iNdEx-- {
			{
				size, err := m.Messages[iNdEx].MarshalToSizedBuffer(dAtA[:i])
				if err != nil {
					return 0, err
				}
				i -= size
				i = encodeVarintNetworkdb(dAtA, i, uint64(size))
			}
			i--
			dAtA[i] = 0xa // field 1, length-delimited
		}
	}
	return len(dAtA) - i, nil
}
// Marshal encodes m into a freshly allocated buffer in protobuf wire
// format and returns the encoded bytes.
func (m *CompoundMessage_SimpleMessage) Marshal() (dAtA []byte, err error) {
	size := m.Size()
	dAtA = make([]byte, size)
	n, err := m.MarshalToSizedBuffer(dAtA[:size])
	if err != nil {
		return nil, err
	}
	return dAtA[:n], nil
}
// MarshalTo encodes m into dAtA, which must be at least m.Size() bytes,
// and returns the number of bytes written.
func (m *CompoundMessage_SimpleMessage) MarshalTo(dAtA []byte) (int, error) {
	size := m.Size()
	return m.MarshalToSizedBuffer(dAtA[:size])
}
// MarshalToSizedBuffer encodes m into the tail of dAtA and returns the
// number of bytes written.
func (m *CompoundMessage_SimpleMessage) MarshalToSizedBuffer(dAtA []byte) (int, error) {
	i := len(dAtA)
	_ = i
	var l int
	_ = l
	if len(m.Payload) > 0 {
		i -= len(m.Payload)
		copy(dAtA[i:], m.Payload)
		i = encodeVarintNetworkdb(dAtA, i, uint64(len(m.Payload)))
		i--
		dAtA[i] = 0xa // field 1, length-delimited
	}
	return len(dAtA) - i, nil
}
// encodeVarintNetworkdb writes v as a protobuf varint so that the final
// byte of the varint lands at offset-1, and returns the index of the
// varint's first byte (the new write position for back-to-front encoding).
func encodeVarintNetworkdb(dAtA []byte, offset int, v uint64) int {
	start := offset - sovNetworkdb(v)
	pos := start
	// emit 7 bits at a time, low-order group first, with the continuation
	// bit (0x80) set on every byte except the last
	for v > 0x7f {
		dAtA[pos] = uint8(v)&0x7f | 0x80
		v >>= 7
		pos++
	}
	dAtA[pos] = uint8(v)
	return start
}
// Size returns the number of bytes the protobuf encoding of m occupies.
// Each present field costs one tag byte plus its varint and/or payload.
func (m *GossipMessage) Size() (n int) {
	if m == nil {
		return 0
	}
	var l int
	_ = l
	if m.Type != 0 {
		n += 1 + sovNetworkdb(uint64(m.Type))
	}
	l = len(m.Data)
	if l > 0 {
		n += 1 + l + sovNetworkdb(uint64(l))
	}
	return n
}
// Size returns the number of bytes the protobuf encoding of m occupies.
func (m *NodeEvent) Size() (n int) {
	if m == nil {
		return 0
	}
	var l int
	_ = l
	if m.Type != 0 {
		n += 1 + sovNetworkdb(uint64(m.Type))
	}
	if m.LTime != 0 {
		n += 1 + sovNetworkdb(uint64(m.LTime))
	}
	l = len(m.NodeName)
	if l > 0 {
		n += 1 + l + sovNetworkdb(uint64(l))
	}
	return n
}
// Size returns the number of bytes the protobuf encoding of m occupies.
func (m *NetworkEvent) Size() (n int) {
	if m == nil {
		return 0
	}
	var l int
	_ = l
	if m.Type != 0 {
		n += 1 + sovNetworkdb(uint64(m.Type))
	}
	if m.LTime != 0 {
		n += 1 + sovNetworkdb(uint64(m.LTime))
	}
	l = len(m.NodeName)
	if l > 0 {
		n += 1 + l + sovNetworkdb(uint64(l))
	}
	l = len(m.NetworkID)
	if l > 0 {
		n += 1 + l + sovNetworkdb(uint64(l))
	}
	return n
}
// Size returns the number of bytes the protobuf encoding of m occupies.
func (m *NetworkEntry) Size() (n int) {
	if m == nil {
		return 0
	}
	var l int
	_ = l
	l = len(m.NetworkID)
	if l > 0 {
		n += 1 + l + sovNetworkdb(uint64(l))
	}
	if m.LTime != 0 {
		n += 1 + sovNetworkdb(uint64(m.LTime))
	}
	l = len(m.NodeName)
	if l > 0 {
		n += 1 + l + sovNetworkdb(uint64(l))
	}
	if m.Leaving {
		// bool: one tag byte + one value byte
		n += 2
	}
	return n
}
// Size returns the number of bytes the protobuf encoding of m occupies,
// including each embedded NetworkEntry plus its length prefix.
func (m *NetworkPushPull) Size() (n int) {
	if m == nil {
		return 0
	}
	var l int
	_ = l
	if m.LTime != 0 {
		n += 1 + sovNetworkdb(uint64(m.LTime))
	}
	if len(m.Networks) > 0 {
		for _, e := range m.Networks {
			l = e.Size()
			n += 1 + l + sovNetworkdb(uint64(l))
		}
	}
	l = len(m.NodeName)
	if l > 0 {
		n += 1 + l + sovNetworkdb(uint64(l))
	}
	return n
}
// Size returns the number of bytes the protobuf encoding of m occupies.
func (m *TableEvent) Size() (n int) {
	if m == nil {
		return 0
	}
	var l int
	_ = l
	if m.Type != 0 {
		n += 1 + sovNetworkdb(uint64(m.Type))
	}
	if m.LTime != 0 {
		n += 1 + sovNetworkdb(uint64(m.LTime))
	}
	l = len(m.NodeName)
	if l > 0 {
		n += 1 + l + sovNetworkdb(uint64(l))
	}
	l = len(m.NetworkID)
	if l > 0 {
		n += 1 + l + sovNetworkdb(uint64(l))
	}
	l = len(m.TableName)
	if l > 0 {
		n += 1 + l + sovNetworkdb(uint64(l))
	}
	l = len(m.Key)
	if l > 0 {
		n += 1 + l + sovNetworkdb(uint64(l))
	}
	l = len(m.Value)
	if l > 0 {
		n += 1 + l + sovNetworkdb(uint64(l))
	}
	if m.ResidualReapTime != 0 {
		n += 1 + sovNetworkdb(uint64(m.ResidualReapTime))
	}
	return n
}
// Size returns the number of bytes the protobuf encoding of m occupies.
func (m *BulkSyncMessage) Size() (n int) {
	if m == nil {
		return 0
	}
	var l int
	_ = l
	if m.LTime != 0 {
		n += 1 + sovNetworkdb(uint64(m.LTime))
	}
	if m.Unsolicited {
		// bool: one tag byte + one value byte
		n += 2
	}
	l = len(m.NodeName)
	if l > 0 {
		n += 1 + l + sovNetworkdb(uint64(l))
	}
	if len(m.Networks) > 0 {
		for _, s := range m.Networks {
			l = len(s)
			n += 1 + l + sovNetworkdb(uint64(l))
		}
	}
	l = len(m.Payload)
	if l > 0 {
		n += 1 + l + sovNetworkdb(uint64(l))
	}
	return n
}
// Size returns the number of bytes the protobuf encoding of m occupies,
// including each embedded SimpleMessage plus its length prefix.
func (m *CompoundMessage) Size() (n int) {
	if m == nil {
		return 0
	}
	var l int
	_ = l
	if len(m.Messages) > 0 {
		for _, e := range m.Messages {
			l = e.Size()
			n += 1 + l + sovNetworkdb(uint64(l))
		}
	}
	return n
}
// Size returns the number of bytes the protobuf encoding of m occupies.
func (m *CompoundMessage_SimpleMessage) Size() (n int) {
	if m == nil {
		return 0
	}
	var l int
	_ = l
	l = len(m.Payload)
	if l > 0 {
		n += 1 + l + sovNetworkdb(uint64(l))
	}
	return n
}
// sovNetworkdb returns the number of bytes needed to encode x as a
// protobuf unsigned varint: one byte per started group of 7 bits, with
// zero costing a single byte.
func sovNetworkdb(x uint64) (n int) {
	for n = 1; x >= 0x80; n++ {
		x >>= 7
	}
	return n
}
// sozNetworkdb returns the varint-encoded size of x after zig-zag
// transformation (the wire representation of sint32/sint64 fields).
func sozNetworkdb(x uint64) (n int) {
	zigzag := (x << 1) ^ uint64(int64(x)>>63)
	return sovNetworkdb(zigzag)
}
// String returns a human-readable representation of this for debugging.
func (this *GossipMessage) String() string {
	if this == nil {
		return "nil"
	}
	s := strings.Join([]string{`&GossipMessage{`,
		`Type:` + fmt.Sprintf("%v", this.Type) + `,`,
		`Data:` + fmt.Sprintf("%v", this.Data) + `,`,
		`}`,
	}, "")
	return s
}
// String returns a human-readable representation of this for debugging.
func (this *NodeEvent) String() string {
	if this == nil {
		return "nil"
	}
	s := strings.Join([]string{`&NodeEvent{`,
		`Type:` + fmt.Sprintf("%v", this.Type) + `,`,
		`LTime:` + fmt.Sprintf("%v", this.LTime) + `,`,
		`NodeName:` + fmt.Sprintf("%v", this.NodeName) + `,`,
		`}`,
	}, "")
	return s
}
// String returns a human-readable representation of this for debugging.
func (this *NetworkEvent) String() string {
	if this == nil {
		return "nil"
	}
	s := strings.Join([]string{`&NetworkEvent{`,
		`Type:` + fmt.Sprintf("%v", this.Type) + `,`,
		`LTime:` + fmt.Sprintf("%v", this.LTime) + `,`,
		`NodeName:` + fmt.Sprintf("%v", this.NodeName) + `,`,
		`NetworkID:` + fmt.Sprintf("%v", this.NetworkID) + `,`,
		`}`,
	}, "")
	return s
}
// String returns a human-readable representation of this for debugging.
func (this *NetworkEntry) String() string {
	if this == nil {
		return "nil"
	}
	s := strings.Join([]string{`&NetworkEntry{`,
		`NetworkID:` + fmt.Sprintf("%v", this.NetworkID) + `,`,
		`LTime:` + fmt.Sprintf("%v", this.LTime) + `,`,
		`NodeName:` + fmt.Sprintf("%v", this.NodeName) + `,`,
		`Leaving:` + fmt.Sprintf("%v", this.Leaving) + `,`,
		`}`,
	}, "")
	return s
}
// String returns a human-readable representation of this for debugging,
// expanding each embedded NetworkEntry.
func (this *NetworkPushPull) String() string {
	if this == nil {
		return "nil"
	}
	repeatedStringForNetworks := "[]*NetworkEntry{"
	for _, f := range this.Networks {
		// the no-op Replace (same old/new string) is a code-generator artifact
		repeatedStringForNetworks += strings.Replace(f.String(), "NetworkEntry", "NetworkEntry", 1) + ","
	}
	repeatedStringForNetworks += "}"
	s := strings.Join([]string{`&NetworkPushPull{`,
		`LTime:` + fmt.Sprintf("%v", this.LTime) + `,`,
		`Networks:` + repeatedStringForNetworks + `,`,
		`NodeName:` + fmt.Sprintf("%v", this.NodeName) + `,`,
		`}`,
	}, "")
	return s
}
// String returns a human-readable representation of this for debugging.
func (this *TableEvent) String() string {
	if this == nil {
		return "nil"
	}
	s := strings.Join([]string{`&TableEvent{`,
		`Type:` + fmt.Sprintf("%v", this.Type) + `,`,
		`LTime:` + fmt.Sprintf("%v", this.LTime) + `,`,
		`NodeName:` + fmt.Sprintf("%v", this.NodeName) + `,`,
		`NetworkID:` + fmt.Sprintf("%v", this.NetworkID) + `,`,
		`TableName:` + fmt.Sprintf("%v", this.TableName) + `,`,
		`Key:` + fmt.Sprintf("%v", this.Key) + `,`,
		`Value:` + fmt.Sprintf("%v", this.Value) + `,`,
		`ResidualReapTime:` + fmt.Sprintf("%v", this.ResidualReapTime) + `,`,
		`}`,
	}, "")
	return s
}
// String returns a human-readable representation of this for debugging.
func (this *BulkSyncMessage) String() string {
	if this == nil {
		return "nil"
	}
	s := strings.Join([]string{`&BulkSyncMessage{`,
		`LTime:` + fmt.Sprintf("%v", this.LTime) + `,`,
		`Unsolicited:` + fmt.Sprintf("%v", this.Unsolicited) + `,`,
		`NodeName:` + fmt.Sprintf("%v", this.NodeName) + `,`,
		`Networks:` + fmt.Sprintf("%v", this.Networks) + `,`,
		`Payload:` + fmt.Sprintf("%v", this.Payload) + `,`,
		`}`,
	}, "")
	return s
}
// String returns a human-readable representation of this for debugging,
// expanding each embedded SimpleMessage.
func (this *CompoundMessage) String() string {
	if this == nil {
		return "nil"
	}
	repeatedStringForMessages := "[]*CompoundMessage_SimpleMessage{"
	for _, f := range this.Messages {
		// the no-op Replace (same old/new string) is a code-generator artifact
		repeatedStringForMessages += strings.Replace(fmt.Sprintf("%v", f), "CompoundMessage_SimpleMessage", "CompoundMessage_SimpleMessage", 1) + ","
	}
	repeatedStringForMessages += "}"
	s := strings.Join([]string{`&CompoundMessage{`,
		`Messages:` + repeatedStringForMessages + `,`,
		`}`,
	}, "")
	return s
}
// String returns a human-readable representation of this for debugging.
func (this *CompoundMessage_SimpleMessage) String() string {
	if this == nil {
		return "nil"
	}
	s := strings.Join([]string{`&CompoundMessage_SimpleMessage{`,
		`Payload:` + fmt.Sprintf("%v", this.Payload) + `,`,
		`}`,
	}, "")
	return s
}
// valueToStringNetworkdb formats a pointer-valued field for String()
// output, rendering nil pointers as "nil" and prefixing dereferenced
// values with "*".
func valueToStringNetworkdb(v interface{}) string {
	ptr := reflect.ValueOf(v)
	if ptr.IsNil() {
		return "nil"
	}
	deref := reflect.Indirect(ptr).Interface()
	return fmt.Sprintf("*%v", deref)
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
// Each loop iteration reads one (tag, value) pair; unknown fields are
// skipped for forward compatibility.
func (m *GossipMessage) Unmarshal(dAtA []byte) error {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		preIndex := iNdEx
		var wire uint64
		// read the field tag (fieldNumber<<3 | wireType) as a varint
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowNetworkdb
			}
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= uint64(b&0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: GossipMessage: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: GossipMessage: illegal tag %d (wire type %d)", fieldNum, wire)
		}
		switch fieldNum {
		case 1:
			// Type (enum, varint)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Type", wireType)
			}
			m.Type = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.Type |= MessageType(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 2:
			// Data (bytes, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Data", wireType)
			}
			var byteLen int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				byteLen |= int(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			if byteLen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + byteLen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Data = append(m.Data[:0], dAtA[iNdEx:postIndex]...)
			if m.Data == nil {
				m.Data = []byte{}
			}
			iNdEx = postIndex
		default:
			// unknown field: skip over its payload
			iNdEx = preIndex
			skippy, err := skipNetworkdb(dAtA[iNdEx:])
			if err != nil {
				return err
			}
			if (skippy < 0) || (iNdEx+skippy) < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			iNdEx += skippy
		}
	}
	if iNdEx > l {
		return io.ErrUnexpectedEOF
	}
	return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
// Unknown fields are skipped for forward compatibility.
func (m *NodeEvent) Unmarshal(dAtA []byte) error {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		preIndex := iNdEx
		var wire uint64
		// read the field tag (fieldNumber<<3 | wireType) as a varint
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowNetworkdb
			}
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= uint64(b&0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: NodeEvent: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: NodeEvent: illegal tag %d (wire type %d)", fieldNum, wire)
		}
		switch fieldNum {
		case 1:
			// Type (enum, varint)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Type", wireType)
			}
			m.Type = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.Type |= NodeEvent_Type(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 2:
			// LTime (Lamport clock, varint)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field LTime", wireType)
			}
			m.LTime = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.LTime |= github_com_hashicorp_serf_serf.LamportTime(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 3:
			// NodeName (string, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field NodeName", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.NodeName = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		default:
			// unknown field: skip over its payload
			iNdEx = preIndex
			skippy, err := skipNetworkdb(dAtA[iNdEx:])
			if err != nil {
				return err
			}
			if (skippy < 0) || (iNdEx+skippy) < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			iNdEx += skippy
		}
	}
	if iNdEx > l {
		return io.ErrUnexpectedEOF
	}
	return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
// Unknown fields are skipped for forward compatibility.
func (m *NetworkEvent) Unmarshal(dAtA []byte) error {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		preIndex := iNdEx
		var wire uint64
		// read the field tag (fieldNumber<<3 | wireType) as a varint
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowNetworkdb
			}
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= uint64(b&0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: NetworkEvent: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: NetworkEvent: illegal tag %d (wire type %d)", fieldNum, wire)
		}
		switch fieldNum {
		case 1:
			// Type (enum, varint)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Type", wireType)
			}
			m.Type = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.Type |= NetworkEvent_Type(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 2:
			// LTime (Lamport clock, varint)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field LTime", wireType)
			}
			m.LTime = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.LTime |= github_com_hashicorp_serf_serf.LamportTime(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 3:
			// NodeName (string, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field NodeName", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.NodeName = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		case 4:
			// NetworkID (string, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field NetworkID", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.NetworkID = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		default:
			// unknown field: skip over its payload
			iNdEx = preIndex
			skippy, err := skipNetworkdb(dAtA[iNdEx:])
			if err != nil {
				return err
			}
			if (skippy < 0) || (iNdEx+skippy) < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			iNdEx += skippy
		}
	}
	if iNdEx > l {
		return io.ErrUnexpectedEOF
	}
	return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
// Unknown fields are skipped for forward compatibility.
func (m *NetworkEntry) Unmarshal(dAtA []byte) error {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		preIndex := iNdEx
		var wire uint64
		// read the field tag (fieldNumber<<3 | wireType) as a varint
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowNetworkdb
			}
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= uint64(b&0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: NetworkEntry: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: NetworkEntry: illegal tag %d (wire type %d)", fieldNum, wire)
		}
		switch fieldNum {
		case 1:
			// NetworkID (string, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field NetworkID", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.NetworkID = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		case 2:
			// LTime (Lamport clock, varint)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field LTime", wireType)
			}
			m.LTime = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.LTime |= github_com_hashicorp_serf_serf.LamportTime(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 3:
			// NodeName (string, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field NodeName", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.NodeName = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		case 4:
			// Leaving (bool, varint: any non-zero value is true)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Leaving", wireType)
			}
			var v int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				v |= int(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			m.Leaving = bool(v != 0)
		default:
			// unknown field: skip over its payload
			iNdEx = preIndex
			skippy, err := skipNetworkdb(dAtA[iNdEx:])
			if err != nil {
				return err
			}
			if (skippy < 0) || (iNdEx+skippy) < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			iNdEx += skippy
		}
	}
	if iNdEx > l {
		return io.ErrUnexpectedEOF
	}
	return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m,
// appending each decoded NetworkEntry to m.Networks. Unknown fields are
// skipped for forward compatibility.
func (m *NetworkPushPull) Unmarshal(dAtA []byte) error {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		preIndex := iNdEx
		var wire uint64
		// read the field tag (fieldNumber<<3 | wireType) as a varint
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowNetworkdb
			}
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= uint64(b&0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: NetworkPushPull: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: NetworkPushPull: illegal tag %d (wire type %d)", fieldNum, wire)
		}
		switch fieldNum {
		case 1:
			// LTime (Lamport clock, varint)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field LTime", wireType)
			}
			m.LTime = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.LTime |= github_com_hashicorp_serf_serf.LamportTime(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 2:
			// Networks (repeated embedded NetworkEntry, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Networks", wireType)
			}
			var msglen int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				msglen |= int(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			if msglen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + msglen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Networks = append(m.Networks, &NetworkEntry{})
			if err := m.Networks[len(m.Networks)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil {
				return err
			}
			iNdEx = postIndex
		case 3:
			// NodeName (string, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field NodeName", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.NodeName = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		default:
			// unknown field: skip over its payload
			iNdEx = preIndex
			skippy, err := skipNetworkdb(dAtA[iNdEx:])
			if err != nil {
				return err
			}
			if (skippy < 0) || (iNdEx+skippy) < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			iNdEx += skippy
		}
	}
	if iNdEx > l {
		return io.ErrUnexpectedEOF
	}
	return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
// Unknown fields are skipped for forward compatibility.
func (m *TableEvent) Unmarshal(dAtA []byte) error {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		preIndex := iNdEx
		var wire uint64
		// read the field tag (fieldNumber<<3 | wireType) as a varint
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowNetworkdb
			}
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= uint64(b&0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: TableEvent: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: TableEvent: illegal tag %d (wire type %d)", fieldNum, wire)
		}
		switch fieldNum {
		case 1:
			// Type (enum, varint)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Type", wireType)
			}
			m.Type = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.Type |= TableEvent_Type(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 2:
			// LTime (Lamport clock, varint)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field LTime", wireType)
			}
			m.LTime = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.LTime |= github_com_hashicorp_serf_serf.LamportTime(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 3:
			// NodeName (string, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field NodeName", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.NodeName = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		case 4:
			// NetworkID (string, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field NetworkID", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.NetworkID = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		case 5:
			// TableName (string, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field TableName", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.TableName = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		case 6:
			// Key (string, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Key", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Key = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		case 7:
			// Value (bytes, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Value", wireType)
			}
			var byteLen int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				byteLen |= int(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			if byteLen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + byteLen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Value = append(m.Value[:0], dAtA[iNdEx:postIndex]...)
			if m.Value == nil {
				m.Value = []byte{}
			}
			iNdEx = postIndex
		case 8:
			// ResidualReapTime (int32, varint)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field ResidualReapTime", wireType)
			}
			m.ResidualReapTime = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.ResidualReapTime |= int32(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		default:
			// unknown field: skip over its payload
			iNdEx = preIndex
			skippy, err := skipNetworkdb(dAtA[iNdEx:])
			if err != nil {
				return err
			}
			if (skippy < 0) || (iNdEx+skippy) < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			iNdEx += skippy
		}
	}
	if iNdEx > l {
		return io.ErrUnexpectedEOF
	}
	return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m,
// appending repeated Networks entries. Unknown fields are skipped for
// forward compatibility.
func (m *BulkSyncMessage) Unmarshal(dAtA []byte) error {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		preIndex := iNdEx
		var wire uint64
		// read the field tag (fieldNumber<<3 | wireType) as a varint
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowNetworkdb
			}
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= uint64(b&0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: BulkSyncMessage: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: BulkSyncMessage: illegal tag %d (wire type %d)", fieldNum, wire)
		}
		switch fieldNum {
		case 1:
			// LTime (Lamport clock, varint)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field LTime", wireType)
			}
			m.LTime = 0
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				m.LTime |= github_com_hashicorp_serf_serf.LamportTime(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
		case 2:
			// Unsolicited (bool, varint: any non-zero value is true)
			if wireType != 0 {
				return fmt.Errorf("proto: wrong wireType = %d for field Unsolicited", wireType)
			}
			var v int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				v |= int(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			m.Unsolicited = bool(v != 0)
		case 3:
			// NodeName (string, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field NodeName", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.NodeName = string(dAtA[iNdEx:postIndex])
			iNdEx = postIndex
		case 4:
			// Networks (repeated string, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Networks", wireType)
			}
			var stringLen uint64
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				stringLen |= uint64(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			intStringLen := int(stringLen)
			if intStringLen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + intStringLen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Networks = append(m.Networks, string(dAtA[iNdEx:postIndex]))
			iNdEx = postIndex
		case 5:
			// Payload (bytes, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Payload", wireType)
			}
			var byteLen int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				byteLen |= int(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			if byteLen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + byteLen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Payload = append(m.Payload[:0], dAtA[iNdEx:postIndex]...)
			if m.Payload == nil {
				m.Payload = []byte{}
			}
			iNdEx = postIndex
		default:
			// unknown field: skip over its payload
			iNdEx = preIndex
			skippy, err := skipNetworkdb(dAtA[iNdEx:])
			if err != nil {
				return err
			}
			if (skippy < 0) || (iNdEx+skippy) < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			iNdEx += skippy
		}
	}
	if iNdEx > l {
		return io.ErrUnexpectedEOF
	}
	return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m,
// appending each decoded SimpleMessage to m.Messages. Unknown fields
// are skipped for forward compatibility.
func (m *CompoundMessage) Unmarshal(dAtA []byte) error {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		preIndex := iNdEx
		var wire uint64
		// read the field tag (fieldNumber<<3 | wireType) as a varint
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowNetworkdb
			}
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= uint64(b&0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		if wireType == 4 {
			return fmt.Errorf("proto: CompoundMessage: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: CompoundMessage: illegal tag %d (wire type %d)", fieldNum, wire)
		}
		switch fieldNum {
		case 1:
			// Messages (repeated embedded SimpleMessage, length-delimited)
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Messages", wireType)
			}
			var msglen int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				msglen |= int(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			if msglen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + msglen
			if postIndex < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			m.Messages = append(m.Messages, &CompoundMessage_SimpleMessage{})
			if err := m.Messages[len(m.Messages)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil {
				return err
			}
			iNdEx = postIndex
		default:
			// unknown field: skip over its payload
			iNdEx = preIndex
			skippy, err := skipNetworkdb(dAtA[iNdEx:])
			if err != nil {
				return err
			}
			if (skippy < 0) || (iNdEx+skippy) < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			iNdEx += skippy
		}
	}
	if iNdEx > l {
		return io.ErrUnexpectedEOF
	}
	return nil
}
// Unmarshal decodes the protobuf wire-format bytes in dAtA into m.
//
// NOTE(review): gogo/protobuf-generated code — change only comments here;
// regenerate from the .proto definition for any code change.
func (m *CompoundMessage_SimpleMessage) Unmarshal(dAtA []byte) error {
	l := len(dAtA)
	iNdEx := 0
	for iNdEx < l {
		preIndex := iNdEx
		var wire uint64
		// Read the varint field tag: (field number << 3) | wire type.
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return ErrIntOverflowNetworkdb
			}
			if iNdEx >= l {
				return io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= uint64(b&0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		fieldNum := int32(wire >> 3)
		wireType := int(wire & 0x7)
		if wireType == 4 {
			// Wire type 4 is END_GROUP; a message body must not start with it.
			return fmt.Errorf("proto: SimpleMessage: wiretype end group for non-group")
		}
		if fieldNum <= 0 {
			return fmt.Errorf("proto: SimpleMessage: illegal tag %d (wire type %d)", fieldNum, wire)
		}
		switch fieldNum {
		case 1:
			// Field 1: Payload — bytes, length-delimited.
			if wireType != 2 {
				return fmt.Errorf("proto: wrong wireType = %d for field Payload", wireType)
			}
			var byteLen int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				byteLen |= int(b&0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			if byteLen < 0 {
				return ErrInvalidLengthNetworkdb
			}
			postIndex := iNdEx + byteLen
			if postIndex < 0 {
				// Overflow of iNdEx+byteLen.
				return ErrInvalidLengthNetworkdb
			}
			if postIndex > l {
				return io.ErrUnexpectedEOF
			}
			// Copy the payload, reusing m.Payload's backing array when possible.
			m.Payload = append(m.Payload[:0], dAtA[iNdEx:postIndex]...)
			if m.Payload == nil {
				// Keep Payload non-nil even for a zero-length field.
				m.Payload = []byte{}
			}
			iNdEx = postIndex
		default:
			// Unknown field: skip it, validating lengths along the way.
			iNdEx = preIndex
			skippy, err := skipNetworkdb(dAtA[iNdEx:])
			if err != nil {
				return err
			}
			if (skippy < 0) || (iNdEx+skippy) < 0 {
				return ErrInvalidLengthNetworkdb
			}
			if (iNdEx + skippy) > l {
				return io.ErrUnexpectedEOF
			}
			iNdEx += skippy
		}
	}

	if iNdEx > l {
		return io.ErrUnexpectedEOF
	}
	return nil
}
// skipNetworkdb returns the number of bytes occupied by the next field
// (including nested groups) at the start of dAtA, so unknown fields can be
// skipped during unmarshaling.
//
// NOTE(review): gogo/protobuf-generated code — change only comments here;
// regenerate from the .proto definition for any code change.
func skipNetworkdb(dAtA []byte) (n int, err error) {
	l := len(dAtA)
	iNdEx := 0
	depth := 0
	for iNdEx < l {
		var wire uint64
		// Read the varint field tag.
		for shift := uint(0); ; shift += 7 {
			if shift >= 64 {
				return 0, ErrIntOverflowNetworkdb
			}
			if iNdEx >= l {
				return 0, io.ErrUnexpectedEOF
			}
			b := dAtA[iNdEx]
			iNdEx++
			wire |= (uint64(b) & 0x7F) << shift
			if b < 0x80 {
				break
			}
		}
		wireType := int(wire & 0x7)
		switch wireType {
		case 0:
			// Varint: consume bytes until the continuation bit clears.
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return 0, ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return 0, io.ErrUnexpectedEOF
				}
				iNdEx++
				if dAtA[iNdEx-1] < 0x80 {
					break
				}
			}
		case 1:
			// Fixed64: 8 bytes.
			iNdEx += 8
		case 2:
			// Length-delimited: varint length followed by that many bytes.
			var length int
			for shift := uint(0); ; shift += 7 {
				if shift >= 64 {
					return 0, ErrIntOverflowNetworkdb
				}
				if iNdEx >= l {
					return 0, io.ErrUnexpectedEOF
				}
				b := dAtA[iNdEx]
				iNdEx++
				length |= (int(b) & 0x7F) << shift
				if b < 0x80 {
					break
				}
			}
			if length < 0 {
				return 0, ErrInvalidLengthNetworkdb
			}
			iNdEx += length
		case 3:
			// START_GROUP: track nesting so the matching END_GROUP is consumed.
			depth++
		case 4:
			// END_GROUP: must close a previously opened group.
			if depth == 0 {
				return 0, ErrUnexpectedEndOfGroupNetworkdb
			}
			depth--
		case 5:
			// Fixed32: 4 bytes.
			iNdEx += 4
		default:
			return 0, fmt.Errorf("proto: illegal wireType %d", wireType)
		}
		if iNdEx < 0 {
			// Index arithmetic overflowed.
			return 0, ErrInvalidLengthNetworkdb
		}
		if depth == 0 {
			return iNdEx, nil
		}
	}
	return 0, io.ErrUnexpectedEOF
}
// Sentinel errors shared by the generated unmarshal/skip routines above.
var (
	// ErrInvalidLengthNetworkdb reports a negative or overflowing length prefix.
	ErrInvalidLengthNetworkdb = fmt.Errorf("proto: negative length found during unmarshaling")
	// ErrIntOverflowNetworkdb reports a varint wider than 64 bits.
	ErrIntOverflowNetworkdb = fmt.Errorf("proto: integer overflow")
	// ErrUnexpectedEndOfGroupNetworkdb reports an END_GROUP tag with no matching START_GROUP.
	ErrUnexpectedEndOfGroupNetworkdb = fmt.Errorf("proto: unexpected end of group")
)
package networkdb
import (
"context"
"encoding/base64"
"fmt"
"net/http"
"strings"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/diagnostic"
"github.com/docker/docker/daemon/libnetwork/internal/caller"
)
// Canned response strings used by the diagnostic handlers below.
const (
	missingParameter = "missing parameter"
	dbNotAvailable   = "database not available"
)
// Mux is the minimal HTTP muxer interface required to register the
// diagnostic handlers; *http.ServeMux satisfies it.
type Mux interface {
	HandleFunc(pattern string, handler func(http.ResponseWriter, *http.Request))
}
// RegisterDiagnosticHandlers attaches all NetworkDB diagnostic endpoints to
// the given muxer.
func (nDB *NetworkDB) RegisterDiagnosticHandlers(m Mux) {
	routes := []struct {
		pattern string
		handler func(http.ResponseWriter, *http.Request)
	}{
		{"/join", nDB.dbJoin},
		{"/networkpeers", nDB.dbPeers},
		{"/clusterpeers", nDB.dbClusterPeers},
		{"/joinnetwork", nDB.dbJoinNetwork},
		{"/leavenetwork", nDB.dbLeaveNetwork},
		{"/createentry", nDB.dbCreateEntry},
		{"/updateentry", nDB.dbUpdateEntry},
		{"/deleteentry", nDB.dbDeleteEntry},
		{"/getentry", nDB.dbGetEntry},
		{"/gettable", nDB.dbGetTable},
		{"/networkstats", nDB.dbNetworkStats},
	}
	for _, route := range routes {
		m.HandleFunc(route.pattern, route.handler)
	}
}
// dbJoin handles the /join diagnostic endpoint: it joins this node to the
// comma-separated cluster members given in the "members" form parameter.
func (nDB *NetworkDB) dbJoin(w http.ResponseWriter, r *http.Request) {
	_ = r.ParseForm()
	diagnostic.DebugHTTPForm(r)
	_, json := diagnostic.ParseHTTPFormOptions(r)

	// audit logs
	logger := log.G(context.TODO()).WithFields(log.Fields{
		"component": "diagnostic",
		"remoteIP":  r.RemoteAddr,
		"method":    caller.Name(0),
		"url":       r.URL.String(),
	})
	logger.Info("join cluster")

	members, ok := r.Form["members"]
	if !ok || len(members) == 0 {
		rsp := diagnostic.WrongCommand(missingParameter, fmt.Sprintf("%s?members=ip1,ip2,...", r.URL.Path))
		logger.Error("join cluster failed, wrong input")
		diagnostic.HTTPReply(w, rsp, json)
		return
	}

	if err := nDB.Join(strings.Split(members[0], ",")); err != nil {
		rsp := diagnostic.FailCommand(fmt.Errorf("%s error in the DB join %s", r.URL.Path, err))
		logger.WithError(err).Error("join cluster failed")
		diagnostic.HTTPReply(w, rsp, json)
		return
	}

	logger.Info("join cluster done")
	diagnostic.HTTPReply(w, diagnostic.CommandSucceed(nil), json)
}
// dbPeers handles the /networkpeers diagnostic endpoint: it lists the peers
// participating in the network identified by the "nid" form parameter.
func (nDB *NetworkDB) dbPeers(w http.ResponseWriter, r *http.Request) {
	_ = r.ParseForm()
	diagnostic.DebugHTTPForm(r)
	_, json := diagnostic.ParseHTTPFormOptions(r)

	// audit logs
	logger := log.G(context.TODO()).WithFields(log.Fields{
		"component": "diagnostic",
		"remoteIP":  r.RemoteAddr,
		"method":    caller.Name(0),
		"url":       r.URL.String(),
	})
	logger.Info("network peers")

	if len(r.Form["nid"]) < 1 {
		rsp := diagnostic.WrongCommand(missingParameter, fmt.Sprintf("%s?nid=test", r.URL.Path))
		logger.Error("network peers failed, wrong input")
		diagnostic.HTTPReply(w, rsp, json)
		return
	}

	peers := nDB.Peers(r.Form["nid"][0])
	rsp := &diagnostic.TableObj{Length: len(peers)}
	for i, peerInfo := range peers {
		// Peers whose IP could not be resolved are flagged as orphans.
		name := peerInfo.Name
		if peerInfo.IP == "unknown" {
			name = "orphan-" + peerInfo.Name
		}
		rsp.Elements = append(rsp.Elements, &diagnostic.PeerEntryObj{Index: i, Name: name, IP: peerInfo.IP})
	}
	logger.WithField("response", fmt.Sprintf("%+v", rsp)).Info("network peers done")
	diagnostic.HTTPReply(w, diagnostic.CommandSucceed(rsp), json)
}
// dbClusterPeers handles the /clusterpeers diagnostic endpoint: it lists
// every node currently known to the cluster.
func (nDB *NetworkDB) dbClusterPeers(w http.ResponseWriter, r *http.Request) {
	_ = r.ParseForm()
	diagnostic.DebugHTTPForm(r)
	_, json := diagnostic.ParseHTTPFormOptions(r)

	// audit logs
	logger := log.G(context.TODO()).WithFields(log.Fields{
		"component": "diagnostic",
		"remoteIP":  r.RemoteAddr,
		"method":    caller.Name(0),
		"url":       r.URL.String(),
	})
	logger.Info("cluster peers")

	peers := nDB.ClusterPeers()
	rsp := &diagnostic.TableObj{Length: len(peers)}
	for i, peerInfo := range peers {
		entry := &diagnostic.PeerEntryObj{Index: i, Name: peerInfo.Name, IP: peerInfo.IP}
		rsp.Elements = append(rsp.Elements, entry)
	}
	logger.WithField("response", fmt.Sprintf("%+v", rsp)).Info("cluster peers done")
	diagnostic.HTTPReply(w, diagnostic.CommandSucceed(rsp), json)
}
// dbCreateEntry handles the /createentry diagnostic endpoint. It creates a
// table entry from the "tname", "nid", "key" and "value" form parameters.
// Unless the request opts into unsafe mode, "value" must be base64-encoded.
func (nDB *NetworkDB) dbCreateEntry(w http.ResponseWriter, r *http.Request) {
	_ = r.ParseForm()
	diagnostic.DebugHTTPForm(r)
	unsafe, json := diagnostic.ParseHTTPFormOptions(r)

	// audit logs
	logger := log.G(context.TODO()).WithFields(log.Fields{
		"component": "diagnostic",
		"remoteIP":  r.RemoteAddr,
		"method":    caller.Name(0),
		"url":       r.URL.String(),
	})
	logger.Info("create entry")

	if len(r.Form["tname"]) < 1 ||
		len(r.Form["nid"]) < 1 ||
		len(r.Form["key"]) < 1 ||
		len(r.Form["value"]) < 1 {
		rsp := diagnostic.WrongCommand(missingParameter, fmt.Sprintf("%s?tname=table_name&nid=network_id&key=k&value=v", r.URL.Path))
		logger.Error("create entry failed, wrong input")
		diagnostic.HTTPReply(w, rsp, json)
		return
	}

	tname := r.Form["tname"][0]
	nid := r.Form["nid"][0]
	key := r.Form["key"][0]
	value := r.Form["value"][0]

	decodedValue := []byte(value)
	if !unsafe {
		var err error
		decodedValue, err = base64.StdEncoding.DecodeString(value)
		if err != nil {
			logger.WithError(err).Error("create entry failed")
			diagnostic.HTTPReply(w, diagnostic.FailCommand(err), json)
			return
		}
	}

	if err := nDB.CreateEntry(tname, nid, key, decodedValue); err != nil {
		// Log before replying, consistent with dbUpdateEntry/dbDeleteEntry.
		logger.WithError(err).Error("create entry failed")
		diagnostic.HTTPReply(w, diagnostic.FailCommand(err), json)
		return
	}
	logger.Info("create entry done")
	diagnostic.HTTPReply(w, diagnostic.CommandSucceed(nil), json)
}
// dbUpdateEntry handles the /updateentry diagnostic endpoint. It updates a
// table entry from the "tname", "nid", "key" and "value" form parameters.
// Unless the request opts into unsafe mode, "value" must be base64-encoded.
func (nDB *NetworkDB) dbUpdateEntry(w http.ResponseWriter, r *http.Request) {
	_ = r.ParseForm()
	diagnostic.DebugHTTPForm(r)
	unsafe, json := diagnostic.ParseHTTPFormOptions(r)

	// audit logs
	logger := log.G(context.TODO()).WithFields(log.Fields{
		"component": "diagnostic",
		"remoteIP":  r.RemoteAddr,
		"method":    caller.Name(0),
		"url":       r.URL.String(),
	})
	logger.Info("update entry")

	if len(r.Form["tname"]) < 1 ||
		len(r.Form["nid"]) < 1 ||
		len(r.Form["key"]) < 1 ||
		len(r.Form["value"]) < 1 {
		rsp := diagnostic.WrongCommand(missingParameter, fmt.Sprintf("%s?tname=table_name&nid=network_id&key=k&value=v", r.URL.Path))
		logger.Error("update entry failed, wrong input")
		diagnostic.HTTPReply(w, rsp, json)
		return
	}

	tname := r.Form["tname"][0]
	nid := r.Form["nid"][0]
	key := r.Form["key"][0]
	value := r.Form["value"][0]

	payload := []byte(value)
	if !unsafe {
		decoded, err := base64.StdEncoding.DecodeString(value)
		if err != nil {
			logger.WithError(err).Error("update entry failed")
			diagnostic.HTTPReply(w, diagnostic.FailCommand(err), json)
			return
		}
		payload = decoded
	}

	if err := nDB.UpdateEntry(tname, nid, key, payload); err != nil {
		logger.WithError(err).Error("update entry failed")
		diagnostic.HTTPReply(w, diagnostic.FailCommand(err), json)
		return
	}
	logger.Info("update entry done")
	diagnostic.HTTPReply(w, diagnostic.CommandSucceed(nil), json)
}
// dbDeleteEntry handles the /deleteentry diagnostic endpoint. It removes the
// table entry identified by the "tname", "nid" and "key" form parameters.
func (nDB *NetworkDB) dbDeleteEntry(w http.ResponseWriter, r *http.Request) {
	_ = r.ParseForm()
	diagnostic.DebugHTTPForm(r)
	_, json := diagnostic.ParseHTTPFormOptions(r)

	// audit logs
	logger := log.G(context.TODO()).WithFields(log.Fields{
		"component": "diagnostic",
		"remoteIP":  r.RemoteAddr,
		"method":    caller.Name(0),
		"url":       r.URL.String(),
	})
	logger.Info("delete entry")

	if len(r.Form["tname"]) < 1 ||
		len(r.Form["nid"]) < 1 ||
		len(r.Form["key"]) < 1 {
		rsp := diagnostic.WrongCommand(missingParameter, fmt.Sprintf("%s?tname=table_name&nid=network_id&key=k", r.URL.Path))
		logger.Error("delete entry failed, wrong input")
		diagnostic.HTTPReply(w, rsp, json)
		return
	}

	tname, nid, key := r.Form["tname"][0], r.Form["nid"][0], r.Form["key"][0]
	if err := nDB.DeleteEntry(tname, nid, key); err != nil {
		logger.WithError(err).Error("delete entry failed")
		diagnostic.HTTPReply(w, diagnostic.FailCommand(err), json)
		return
	}
	logger.Info("delete entry done")
	diagnostic.HTTPReply(w, diagnostic.CommandSucceed(nil), json)
}
// dbGetEntry handles the /getentry diagnostic endpoint. It fetches the table
// entry identified by "tname", "nid" and "key"; the value is returned
// base64-encoded unless the request opts into unsafe mode.
func (nDB *NetworkDB) dbGetEntry(w http.ResponseWriter, r *http.Request) {
	_ = r.ParseForm()
	diagnostic.DebugHTTPForm(r)
	unsafe, json := diagnostic.ParseHTTPFormOptions(r)

	// audit logs
	logger := log.G(context.TODO()).WithFields(log.Fields{
		"component": "diagnostic",
		"remoteIP":  r.RemoteAddr,
		"method":    caller.Name(0),
		"url":       r.URL.String(),
	})
	logger.Info("get entry")

	if len(r.Form["tname"]) < 1 ||
		len(r.Form["nid"]) < 1 ||
		len(r.Form["key"]) < 1 {
		rsp := diagnostic.WrongCommand(missingParameter, fmt.Sprintf("%s?tname=table_name&nid=network_id&key=k", r.URL.Path))
		logger.Error("get entry failed, wrong input")
		diagnostic.HTTPReply(w, rsp, json)
		return
	}

	tname, nid, key := r.Form["tname"][0], r.Form["nid"][0], r.Form["key"][0]
	value, err := nDB.GetEntry(tname, nid, key)
	if err != nil {
		logger.WithError(err).Error("get entry failed")
		diagnostic.HTTPReply(w, diagnostic.FailCommand(err), json)
		return
	}

	encodedValue := base64.StdEncoding.EncodeToString(value)
	if unsafe {
		encodedValue = string(value)
	}

	rsp := &diagnostic.TableEntryObj{Key: key, Value: encodedValue}
	logger.WithField("response", fmt.Sprintf("%+v", rsp)).Info("get entry done")
	diagnostic.HTTPReply(w, diagnostic.CommandSucceed(rsp), json)
}
// dbJoinNetwork handles the /joinnetwork diagnostic endpoint: it joins this
// node to the network identified by the "nid" form parameter.
func (nDB *NetworkDB) dbJoinNetwork(w http.ResponseWriter, r *http.Request) {
	_ = r.ParseForm()
	diagnostic.DebugHTTPForm(r)
	_, json := diagnostic.ParseHTTPFormOptions(r)

	// audit logs
	logger := log.G(context.TODO()).WithFields(log.Fields{
		"component": "diagnostic",
		"remoteIP":  r.RemoteAddr,
		"method":    caller.Name(0),
		"url":       r.URL.String(),
	})
	logger.Info("join network")

	nids, ok := r.Form["nid"]
	if !ok || len(nids) == 0 {
		rsp := diagnostic.WrongCommand(missingParameter, fmt.Sprintf("%s?nid=network_id", r.URL.Path))
		logger.Error("join network failed, wrong input")
		diagnostic.HTTPReply(w, rsp, json)
		return
	}

	if err := nDB.JoinNetwork(nids[0]); err != nil {
		logger.WithError(err).Error("join network failed")
		diagnostic.HTTPReply(w, diagnostic.FailCommand(err), json)
		return
	}
	logger.Info("join network done")
	diagnostic.HTTPReply(w, diagnostic.CommandSucceed(nil), json)
}
// dbLeaveNetwork handles the /leavenetwork diagnostic endpoint: it removes
// this node from the network identified by the "nid" form parameter.
func (nDB *NetworkDB) dbLeaveNetwork(w http.ResponseWriter, r *http.Request) {
	_ = r.ParseForm()
	diagnostic.DebugHTTPForm(r)
	_, json := diagnostic.ParseHTTPFormOptions(r)

	// audit logs
	logger := log.G(context.TODO()).WithFields(log.Fields{
		"component": "diagnostic",
		"remoteIP":  r.RemoteAddr,
		"method":    caller.Name(0),
		"url":       r.URL.String(),
	})
	logger.Info("leave network")

	nids, ok := r.Form["nid"]
	if !ok || len(nids) == 0 {
		rsp := diagnostic.WrongCommand(missingParameter, fmt.Sprintf("%s?nid=network_id", r.URL.Path))
		logger.Error("leave network failed, wrong input")
		diagnostic.HTTPReply(w, rsp, json)
		return
	}

	if err := nDB.LeaveNetwork(nids[0]); err != nil {
		logger.WithError(err).Error("leave network failed")
		diagnostic.HTTPReply(w, diagnostic.FailCommand(err), json)
		return
	}
	logger.Info("leave network done")
	diagnostic.HTTPReply(w, diagnostic.CommandSucceed(nil), json)
}
// dbGetTable handles the /gettable diagnostic endpoint. It dumps all entries
// of table "tname" in network "nid"; values are base64-encoded unless the
// request opts into unsafe mode.
func (nDB *NetworkDB) dbGetTable(w http.ResponseWriter, r *http.Request) {
	_ = r.ParseForm()
	diagnostic.DebugHTTPForm(r)
	unsafe, json := diagnostic.ParseHTTPFormOptions(r)

	// audit logs
	logger := log.G(context.TODO()).WithFields(log.Fields{
		"component": "diagnostic",
		"remoteIP":  r.RemoteAddr,
		"method":    caller.Name(0),
		"url":       r.URL.String(),
	})
	logger.Info("get table")

	if len(r.Form["tname"]) < 1 ||
		len(r.Form["nid"]) < 1 {
		rsp := diagnostic.WrongCommand(missingParameter, fmt.Sprintf("%s?tname=table_name&nid=network_id", r.URL.Path))
		logger.Error("get table failed, wrong input")
		diagnostic.HTTPReply(w, rsp, json)
		return
	}

	tname, nid := r.Form["tname"][0], r.Form["nid"][0]
	table := nDB.GetTableByNetwork(tname, nid)
	rsp := &diagnostic.TableObj{Length: len(table)}
	idx := 0
	for k, v := range table {
		encodedValue := base64.StdEncoding.EncodeToString(v.Value)
		if unsafe {
			encodedValue = string(v.Value)
		}
		rsp.Elements = append(rsp.Elements,
			&diagnostic.TableEntryObj{
				Index: idx,
				Key:   k,
				Value: encodedValue,
				Owner: v.owner,
			})
		idx++
	}
	logger.WithField("response", fmt.Sprintf("%+v", rsp)).Info("get table done")
	diagnostic.HTTPReply(w, diagnostic.CommandSucceed(rsp), json)
}
// dbNetworkStats handles the /networkstats diagnostic endpoint. It reports
// the entry count and broadcast-queue length for the network identified by
// the "nid" form parameter; both are -1 when this node is not in the network.
func (nDB *NetworkDB) dbNetworkStats(w http.ResponseWriter, r *http.Request) {
	_ = r.ParseForm()
	diagnostic.DebugHTTPForm(r)
	_, json := diagnostic.ParseHTTPFormOptions(r)

	// audit logs
	logger := log.G(context.TODO()).WithFields(log.Fields{
		"component": "diagnostic",
		"remoteIP":  r.RemoteAddr,
		"method":    caller.Name(0),
		"url":       r.URL.String(),
	})
	logger.Info("network stats")

	if len(r.Form["nid"]) < 1 {
		rsp := diagnostic.WrongCommand(missingParameter, fmt.Sprintf("%s?nid=test", r.URL.Path))
		logger.Error("network stats failed, wrong input")
		diagnostic.HTTPReply(w, rsp, json)
		return
	}

	// Read the per-network counters under the read lock; -1 signals that the
	// network is unknown to this node.
	nDB.RLock()
	network, ok := nDB.thisNodeNetworks[r.Form["nid"][0]]
	entries := -1
	qLen := -1
	if ok {
		entries = int(network.entriesNumber.Load())
		qLen = network.tableBroadcasts.NumQueued()
	}
	nDB.RUnlock()

	rsp := diagnostic.CommandSucceed(&diagnostic.NetworkStatsResult{Entries: entries, QueueLen: qLen})
	logger.WithField("response", fmt.Sprintf("%+v", rsp)).Info("network stats done")
	diagnostic.HTTPReply(w, rsp, json)
}
package networkdb
import (
"context"
"fmt"
"github.com/containerd/log"
"github.com/hashicorp/memberlist"
)
// nodeState represents the tracked liveness state of a cluster node.
type nodeState int

const (
	nodeNotFound    nodeState = -1 // node is absent from every node list
	nodeActiveState nodeState = 0  // node is an active cluster member
	nodeLeftState   nodeState = 1  // node gracefully left the cluster
	nodeFailedState nodeState = 2  // node was detected as failed
)

// nodeStateName maps each nodeState to a human-readable label for logging.
// Keyed by the named constants (not raw literals) so the map cannot silently
// drift from the enum above.
var nodeStateName = map[nodeState]string{
	nodeNotFound:    "NodeNotFound",
	nodeActiveState: "NodeActive",
	nodeLeftState:   "NodeLeft",
	nodeFailedState: "NodeFailed",
}
// findNode searches the three node lists (active, left, failed — in that
// order) for nodeName and returns the node pointer, its state, and the list
// it was found in. It returns (nil, nodeNotFound, nil) when absent.
func (nDB *NetworkDB) findNode(nodeName string) (*node, nodeState, map[string]*node) {
	lists := []struct {
		state nodeState
		nodes map[string]*node
	}{
		{nodeActiveState, nDB.nodes},
		{nodeLeftState, nDB.leftNodes},
		{nodeFailedState, nDB.failedNodes},
	}
	for _, l := range lists {
		if n, ok := l.nodes[nodeName]; ok {
			return n, l.state, l.nodes
		}
	}
	return nil, nodeNotFound, nil
}
// changeNodeState changes the state of the node specified, returns true if the node was moved,
// false if there was no need to change the node state. Error will be returned if the node does not
// exists
func (nDB *NetworkDB) changeNodeState(nodeName string, newState nodeState) (bool, error) {
	n, currState, m := nDB.findNode(nodeName)
	if n == nil {
		return false, fmt.Errorf("node %s not found", nodeName)
	}

	// Move the node from its current list (m) into the list matching newState;
	// a no-op when it is already there.
	switch newState {
	case nodeActiveState:
		if currState == nodeActiveState {
			return false, nil
		}

		delete(m, nodeName)
		// reset the node reap time
		n.reapTime = 0
		nDB.nodes[nodeName] = n
	case nodeLeftState:
		if currState == nodeLeftState {
			return false, nil
		}

		delete(m, nodeName)
		nDB.leftNodes[nodeName] = n
	case nodeFailedState:
		if currState == nodeFailedState {
			return false, nil
		}

		delete(m, nodeName)
		nDB.failedNodes[nodeName] = n
	default:
		// TODO(thaJeztah): make switch exhaustive; add networkdb.nodeNotFound
	}

	// Refresh the cached estimate of active nodes after the move.
	nDB.estNodes.Store(int32(len(nDB.nodes)))

	log.G(context.TODO()).Infof("Node %s change state %s --> %s", nodeName, nodeStateName[currState], nodeStateName[newState])

	if newState == nodeLeftState || newState == nodeFailedState {
		// set the node reap time, if not already set
		// It is possible that a node passes from failed to left and the reaptime was already set so keep that value
		if n.reapTime == 0 {
			n.reapTime = nodeReapInterval
		}
		// The node leave or fails, delete all the entries created by it.
		// If the node was temporary down, deleting the entries will guarantee that the CREATE events will be accepted
		// If the node instead left because was going down, then it makes sense to just delete all its state
		nDB.deleteNodeFromNetworks(n.Name)
		nDB.deleteNodeTableEntries(n.Name)
	}

	return true, nil
}
// purgeReincarnation checks whether mn is a new incarnation (same address and
// port, different name) of a node already present in one of the three node
// lists. If so, the old entry is moved to the left state and true is
// returned; otherwise false.
//
// The three previously copy-pasted loops are folded into one table-driven
// loop; the lists are scanned in the original order (active, failed, left)
// and the log messages are byte-identical to the originals.
func (nDB *NetworkDB) purgeReincarnation(mn *memberlist.Node) bool {
	for _, l := range []struct {
		desc  string
		nodes map[string]*node
	}{
		{"active", nDB.nodes},
		{"failed", nDB.failedNodes},
		{"shutdown", nDB.leftNodes},
	} {
		for name, node := range l.nodes {
			if node.Addr.Equal(mn.Addr) && node.Port == mn.Port && mn.Name != name {
				log.G(context.TODO()).Infof("Node %s/%s, is the new incarnation of the %s node %s/%s", mn.Name, mn.Addr, l.desc, name, node.Addr)
				nDB.changeNodeState(name, nodeLeftState)
				return true
			}
		}
	}
	return false
}
// estNumNodes returns the cached estimate of active cluster nodes, as last
// stored in estNodes by changeNodeState.
func (nDB *NetworkDB) estNumNodes() int {
	return int(nDB.estNodes.Load())
}
package networkdb
import (
"net"
"strings"
"github.com/docker/go-events"
)
// opType identifies the kind of table operation carried by an event.
type opType uint8

const (
	opCreate opType = 1 + iota // table entry created
	opUpdate                   // table entry updated
	opDelete                   // table entry deleted
)

// event is the common payload shared by CreateEvent, UpdateEvent and
// DeleteEvent.
type event struct {
	Table     string
	NetworkID string
	Key       string
	Value     []byte
}

// NodeTable represents table event for node join and leave
const NodeTable = "NodeTable"

// NodeAddr represents the value carried for node event in NodeTable
type NodeAddr struct {
	Addr net.IP
}

// CreateEvent generates a table entry create event to the watchers
type CreateEvent event

// UpdateEvent generates a table entry update event to the watchers
type UpdateEvent event

// DeleteEvent generates a table entry delete event to the watchers
type DeleteEvent event
// Watch creates a watcher with filters for a particular table or
// network or any combination of the tuple. If any of the
// filter is an empty string it acts as a wildcard for that
// field. Watch returns a channel of events, where the events will be
// sent. The watch channel is initialized with synthetic create events for all
// the existing table entries not owned by this node which match the filters.
func (nDB *NetworkDB) Watch(tname, nid string) (*events.Channel, func()) {
	// Build a matcher only when at least one filter is set; a nil matcher
	// means every event is delivered.
	var matcher events.Matcher

	if tname != "" || nid != "" {
		matcher = events.MatcherFunc(func(ev events.Event) bool {
			var evt event
			switch ev := ev.(type) {
			case CreateEvent:
				evt = event(ev)
			case UpdateEvent:
				evt = event(ev)
			case DeleteEvent:
				evt = event(ev)
			}

			if tname != "" && evt.Table != tname {
				return false
			}

			if nid != "" && evt.NetworkID != nid {
				return false
			}

			return true
		})
	}

	// Queue decouples the broadcaster from the consumer channel; the filter
	// (when present) is layered in front of the queue.
	ch := events.NewChannel(0)
	sink := events.Sink(events.NewQueue(ch))

	if matcher != nil {
		sink = events.NewFilter(sink, matcher)
	}

	// Synthesize events for all the existing table entries not owned by
	// this node so that the watcher receives all state without racing with
	// any concurrent mutations to the table.
	nDB.RLock()
	defer nDB.RUnlock()
	if tname == "" {
		// No table filter: walk the byNetwork index, whose keys are laid out
		// as /<nid>/<tname>/<key> (see the tuple unpacking below).
		var prefix []byte
		if nid != "" {
			prefix = []byte("/" + nid + "/")
		} else {
			prefix = []byte("/")
		}
		nDB.indexes[byNetwork].Root().WalkPrefix(prefix, func(path []byte, v *entry) bool {
			if !v.deleting && v.node != nDB.config.NodeID {
				tuple := strings.SplitN(string(path[1:]), "/", 3)
				if len(tuple) == 3 {
					entryNid, entryTname, key := tuple[0], tuple[1], tuple[2]
					sink.Write(makeEvent(opCreate, entryTname, entryNid, key, v.value))
				}
			}
			// false = keep walking the remaining entries.
			return false
		})
	} else {
		// Table filter set: walk the byTable index, whose keys are laid out
		// as /<tname>/<nid>/<key>.
		prefix := []byte("/" + tname + "/")
		if nid != "" {
			prefix = append(prefix, []byte(nid+"/")...)
		}
		nDB.indexes[byTable].Root().WalkPrefix(prefix, func(path []byte, v *entry) bool {
			if !v.deleting && v.node != nDB.config.NodeID {
				tuple := strings.SplitN(string(path[1:]), "/", 3)
				if len(tuple) == 3 {
					entryTname, entryNid, key := tuple[0], tuple[1], tuple[2]
					sink.Write(makeEvent(opCreate, entryTname, entryNid, key, v.value))
				}
			}
			// false = keep walking the remaining entries.
			return false
		})
	}

	// Subscribe the sink for live events; the returned cancel func
	// unsubscribes and tears everything down.
	nDB.broadcaster.Add(sink)
	return ch, func() {
		nDB.broadcaster.Remove(sink)
		ch.Close()
		sink.Close()
	}
}
// makeEvent wraps the given table operation into the concrete event type
// (CreateEvent, UpdateEvent or DeleteEvent) delivered to watchers, or nil
// for an unknown operation.
func makeEvent(op opType, tname, nid, key string, value []byte) events.Event {
	base := event{
		Table:     tname,
		NetworkID: nid,
		Key:       key,
		Value:     value,
	}
	switch op {
	case opCreate:
		return CreateEvent(base)
	case opUpdate:
		return UpdateEvent(base)
	case opDelete:
		return DeleteEvent(base)
	default:
		return nil
	}
}
package ns
import (
"context"
"sync"
"syscall"
"time"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/internal/modprobe"
"github.com/docker/docker/internal/nlwrap"
"github.com/vishvananda/netns"
)
var (
	// initNs is the handle of the network namespace the process was in when
	// Init first ran.
	initNs netns.NsHandle
	// initNl is a netlink handle opened in that initial namespace.
	initNl nlwrap.Handle
	// initOnce guards the lazy one-time initialization performed by Init.
	initOnce sync.Once
	// NetlinkSocketsTimeout represents the default timeout duration for the sockets
	NetlinkSocketsTimeout = 3 * time.Second
)
// Init populates the package-level handles for the current network namespace
// and a netlink handle within it. It is normally invoked lazily (once) via
// getHandler/NlHandle; errors are logged rather than returned, leaving the
// corresponding handle in its zero state.
func Init() {
	var err error
	initNs, err = netns.Get()
	if err != nil {
		log.G(context.TODO()).Errorf("could not get initial namespace: %v", err)
	}
	initNl, err = nlwrap.NewHandle(getSupportedNlFamilies()...)
	if err != nil {
		log.G(context.TODO()).Errorf("could not create netlink handle on initial namespace: %v", err)
	}
	// A failed timeout setup is non-fatal; the handle still works with the
	// library defaults.
	err = initNl.SetSocketTimeout(NetlinkSocketsTimeout)
	if err != nil {
		log.G(context.TODO()).Warnf("Failed to set the timeout on the default netlink handle sockets: %v", err)
	}
}
// ParseHandlerInt transforms the namespace handler into an integer
// (the underlying file descriptor value).
func ParseHandlerInt() int {
	return int(getHandler())
}
// getHandler returns the handle of the initial network namespace, running
// the lazy one-time initialization if needed.
func getHandler() netns.NsHandle {
	initOnce.Do(Init)
	return initNs
}
// NlHandle returns the netlink handle opened in the initial network
// namespace, running the lazy one-time initialization if needed.
func NlHandle() nlwrap.Handle {
	initOnce.Do(Init)
	return initNl
}
// getSupportedNlFamilies probes which netlink families can be used on this
// host: NETLINK_ROUTE is always included, while NETLINK_XFRM and
// NETLINK_NETFILTER are added only when their kernel support checks pass.
func getSupportedNlFamilies() []int {
	fams := []int{syscall.NETLINK_ROUTE}

	// NETLINK_XFRM test
	if err := checkXfrmSocket(); err == nil {
		fams = append(fams, syscall.NETLINK_XFRM)
	} else {
		log.G(context.TODO()).Warnf("Could not load necessary modules for IPSEC rules: %v", err)
	}

	// NETLINK_NETFILTER test
	if err := modprobe.LoadModules(context.TODO(), checkNfSocket, "nf_conntrack", "nf_conntrack_netlink"); err == nil {
		fams = append(fams, syscall.NETLINK_NETFILTER)
	} else {
		log.G(context.TODO()).Warnf("Could not load necessary modules for Conntrack: %v", err)
	}

	return fams
}
// checkXfrmSocket probes for NETLINK_XFRM support (xfrm_user, xfrm_algo
// modules) by opening — and immediately closing — a raw netlink socket of
// that family.
func checkXfrmSocket() error {
	fd, err := syscall.Socket(syscall.AF_NETLINK, syscall.SOCK_RAW, syscall.NETLINK_XFRM)
	if err != nil {
		return err
	}
	defer syscall.Close(fd)
	return nil
}
// checkNfSocket probes for NETLINK_NETFILTER support (nf_conntrack,
// nf_conntrack_netlink modules) by opening — and immediately closing — a raw
// netlink socket of that family.
func checkNfSocket() error {
	fd, err := syscall.Socket(syscall.AF_NETLINK, syscall.SOCK_RAW, syscall.NETLINK_NETFILTER)
	if err != nil {
		return err
	}
	defer syscall.Close(fd)
	return nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
// Package options provides a way to pass unstructured sets of options to a
// component expecting a strongly-typed configuration structure.
package options
import (
"fmt"
"reflect"
)
// NoSuchFieldError is the error returned when the generic parameters hold a
// value for a field absent from the destination structure.
type NoSuchFieldError struct {
	Field string
	Type  string
}

// Error implements the error interface.
func (e NoSuchFieldError) Error() string {
	return fmt.Sprintf("no field %q in type %q", e.Field, e.Type)
}
// CannotSetFieldError is the error returned when the generic parameters hold a
// value for a field that cannot be set in the destination structure.
type CannotSetFieldError struct {
	Field string
	Type  string
}

// Error implements the error interface.
func (e CannotSetFieldError) Error() string {
	return fmt.Sprintf("cannot set field %q of type %q", e.Field, e.Type)
}
// TypeMismatchError is the error returned when the type of the generic value
// for a field mismatches the type of the destination structure.
type TypeMismatchError struct {
	Field      string
	ExpectType string
	ActualType string
}

// Error implements the error interface.
// NOTE(review): "require" (not "requires") is kept as-is — callers may match
// on this exact message.
func (e TypeMismatchError) Error() string {
	return fmt.Sprintf("type mismatch, field %s require type %v, actual type %v", e.Field, e.ExpectType, e.ActualType)
}

// Generic is a basic type to store arbitrary settings.
type Generic map[string]any
// GenerateFromModel takes the generic options, and tries to build a new
// instance of the model's type by matching keys from the generic options to
// fields in the model.
//
// The return value is of the same type than the model (including a potential
// pointer qualifier).
func GenerateFromModel(options Generic, model any) (any, error) {
	modType := reflect.TypeOf(model)

	// If the model is of pointer type, dereference so reflect.New builds the
	// underlying struct rather than a pointer-to-pointer.
	resType := modType
	if modType.Kind() == reflect.Ptr {
		resType = resType.Elem()
	}

	// Populate the result structure with the generic layout content.
	res := reflect.New(resType)
	for name, value := range options {
		field := res.Elem().FieldByName(name)
		if !field.IsValid() {
			return nil, NoSuchFieldError{name, resType.String()}
		}
		if !field.CanSet() {
			return nil, CannotSetFieldError{name, resType.String()}
		}
		// A nil option value has no dynamic type: reflect.TypeOf(value) is
		// nil, and calling String() on it would panic. Report it as a type
		// mismatch instead.
		valType := reflect.TypeOf(value)
		if valType == nil {
			return nil, TypeMismatchError{name, field.Type().String(), "nil"}
		}
		if valType != field.Type() {
			return nil, TypeMismatchError{name, field.Type().String(), valType.String()}
		}
		field.Set(reflect.ValueOf(value))
	}

	// If the model is not of pointer type, return content of the result.
	if modType.Kind() == reflect.Ptr {
		return res.Interface(), nil
	}
	return res.Elem().Interface(), nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package osl
import (
"bytes"
"context"
"errors"
"fmt"
"net"
"os"
"path/filepath"
"slices"
"sort"
"strconv"
"strings"
"syscall"
"time"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/internal/l2disco"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/internal/nlwrap"
"github.com/vishvananda/netlink"
"github.com/vishvananda/netns"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
"golang.org/x/sys/unix"
)
const (
	// AdvertiseAddrNMsgsMin defines the minimum number of ARP/NA messages sent when an
	// interface is configured.
	// Zero can be used to disable unsolicited ARP/NA.
	AdvertiseAddrNMsgsMin = 0
	// AdvertiseAddrNMsgsMax defines the maximum number of ARP/NA messages sent when an
	// interface is configured. It's three, to match RFC-5227 Section 1.1
	// ("PROBE_NUM=3") and RFC-4861 MAX_NEIGHBOR_ADVERTISEMENT.
	AdvertiseAddrNMsgsMax = 3
	// advertiseAddrNMsgsDefault is the default number of ARP/NA messages sent when
	// an interface is configured.
	advertiseAddrNMsgsDefault = 3

	// AdvertiseAddrIntervalMin defines the minimum interval between ARP/NA messages
	// sent when an interface is configured. The min defined here is nonstandard,
	// RFC-5227 PROBE_MIN and the default for RetransTimer in RFC-4861 are one
	// second. But, faster resends may be useful in a bridge network (where packets
	// are not transmitted on a real network).
	AdvertiseAddrIntervalMin = 100 * time.Millisecond
	// AdvertiseAddrIntervalMax defines the maximum interval between ARP/NA messages
	// sent when an interface is configured. The max of 2s matches RFC-5227
	// PROBE_MAX.
	AdvertiseAddrIntervalMax = 2 * time.Second
	// advertiseAddrIntervalDefault is the default interval between ARP/NA messages
	// sent when an interface is configured.
	// One second matches RFC-5227 PROBE_MIN and the default for RetransTimer in RFC-4861.
	advertiseAddrIntervalDefault = time.Second
)
// newInterface builds an Interface for srcName in namespace ns, applies the
// provided options, and — when a master interface was requested — resolves
// its destination name within the namespace.
func newInterface(ns *Namespace, srcName, dstPrefix, dstName string, options ...IfaceOption) (*Interface, error) {
	iface := &Interface{
		stopCh:                make(chan struct{}),
		srcName:               srcName,
		dstPrefix:             dstPrefix,
		dstName:               dstName,
		advertiseAddrNMsgs:    advertiseAddrNMsgsDefault,
		advertiseAddrInterval: advertiseAddrIntervalDefault,
		ns:                    ns,
	}
	for _, opt := range options {
		if opt == nil {
			continue
		}
		// TODO(thaJeztah): use multi-error instead of returning early.
		if err := opt(iface); err != nil {
			return nil, err
		}
	}
	if iface.master == "" {
		return iface, nil
	}
	iface.dstMaster = ns.findDst(iface.master, true)
	if iface.dstMaster == "" {
		return nil, fmt.Errorf("could not find an appropriate master %q for %q", iface.master, iface.srcName)
	}
	return iface, nil
}
// Interface represents the settings and identity of a network device.
// It is used as a return type for Network.Link, and it is common practice
// for the caller to use this information when moving interface SrcName from
// host namespace to DstName in a different net namespace with the appropriate
// network settings.
type Interface struct {
	stopCh      chan struct{} // stopCh is closed before the interface is deleted.
	srcName     string        // name in the origin namespace
	dstPrefix   string        // prefix used to generate dstName
	dstName     string        // final name in the target namespace
	master      string        // requested master interface (by source name)
	dstMaster   string        // resolved destination name of the master
	mac         net.HardwareAddr
	address     *net.IPNet   // IPv4 address
	addressIPv6 *net.IPNet   // IPv6 address
	llAddrs     []*net.IPNet // link-local addresses
	routes      []*net.IPNet
	bridge      bool // whether this interface is a bridge
	sysctls     []string
	// advertiseAddrNMsgs is the number of unsolicited ARP/NA messages that will be sent to
	// advertise the interface's addresses. No messages will be sent if this is zero.
	advertiseAddrNMsgs int
	// advertiseAddrInterval is the interval between unsolicited ARP/NA messages sent to
	// advertise the interface's addresses.
	advertiseAddrInterval time.Duration
	createdInContainer    bool
	ns                    *Namespace
}
// SrcName returns the name of the interface in the origin network namespace.
func (i *Interface) SrcName() string {
	return i.srcName
}
// DstName returns the final interface name in the target network namespace.
// It's generated based on the prefix passed to [Namespace.AddInterface].
func (i *Interface) DstName() string {
	return i.dstName
}
// DstMaster returns the destination name of this interface's master
// interface, as resolved when the interface was created.
func (i *Interface) DstMaster() string {
	return i.dstMaster
}
// Bridge returns true if the interface is a bridge.
func (i *Interface) Bridge() bool {
	return i.bridge
}
func (i *Interface) MacAddress() net.HardwareAddr {
return types.GetMacCopy(i.mac)
}
// Address returns the IPv4 address for the interface.
func (i *Interface) Address() *net.IPNet {
return types.GetIPNetCopy(i.address)
}
// AddressIPv6 returns the IPv6 address for the interface.
func (i *Interface) AddressIPv6() *net.IPNet {
return types.GetIPNetCopy(i.addressIPv6)
}
// LinkLocalAddresses returns the link-local IP addresses assigned to the
// interface.
func (i *Interface) LinkLocalAddresses() []*net.IPNet {
return i.llAddrs
}
// Routes returns IP routes for the interface.
func (i *Interface) Routes() []*net.IPNet {
routes := make([]*net.IPNet, len(i.routes))
for index, route := range i.routes {
routes[index] = types.GetIPNetCopy(route)
}
return routes
}
// Remove an interface from the sandbox by renaming to original name
// and moving it out of the sandbox. It delegates to the owning
// namespace's RemoveInterface.
func (i *Interface) Remove() error {
	return i.ns.RemoveInterface(i)
}
// Statistics returns the sandbox's side veth interface statistics.
func (i *Interface) Statistics() (*types.InterfaceStatistics, error) {
	link, err := i.ns.nlHandle.LinkByName(i.DstName())
	if err != nil {
		return nil, fmt.Errorf("failed to retrieve the statistics for %s in netns %s: %v", i.DstName(), i.ns.path, err)
	}
	linkStats := link.Attrs().Statistics
	if linkStats == nil {
		return nil, errors.New("no statistics were returned")
	}
	// Copy the counters we expose into our own type.
	return &types.InterfaceStatistics{
		RxBytes:   linkStats.RxBytes,
		RxPackets: linkStats.RxPackets,
		RxDropped: linkStats.RxDropped,
		TxBytes:   linkStats.TxBytes,
		TxPackets: linkStats.TxPackets,
		TxDropped: linkStats.TxDropped,
	}, nil
}
// findDst returns the in-namespace (dst) name of the interface whose source
// name is srcName. When isBridge is set, only bridge interfaces match. An
// empty string is returned when nothing matches.
func (n *Namespace) findDst(srcName string, isBridge bool) string {
	n.mu.Lock()
	defer n.mu.Unlock()
	for _, candidate := range n.iFaces {
		// The master should match the srcname of the interface and the
		// master interface should be of type bridge, if searching for a bridge type.
		if candidate.SrcName() != srcName {
			continue
		}
		if isBridge && !candidate.Bridge() {
			continue
		}
		return candidate.DstName()
	}
	return ""
}
// moveLink moves iface into the network namespace identified by nsh,
// recording the operation in a tracing span. nlhHost is the netlink handle
// for the namespace where iface currently lives (the caller passes the
// host handle); i only supplies the names used for the span attribute and
// the error message.
func moveLink(ctx context.Context, nlhHost nlwrap.Handle, iface netlink.Link, i *Interface, nsh netns.NsHandle) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.moveLink", trace.WithAttributes(
		attribute.String("ifaceName", i.DstName())))
	defer span.End()

	if err := nlhHost.LinkSetNsFd(iface, int(nsh)); err != nil {
		return fmt.Errorf("failed to set namespace on link %q: %v", i.srcName, err)
	}
	return nil
}
// AddInterface creates an Interface that represents an existing network
// interface (except for bridge interfaces, which are created here).
//
// The network interface will be reconfigured according the options passed, and
// it'll be renamed from srcName into either dstName if it's not empty, or to
// an auto-generated dest name that combines the provided dstPrefix and a
// numeric suffix.
//
// It's safe to call concurrently.
func (n *Namespace) AddInterface(ctx context.Context, srcName, dstPrefix, dstName string, options ...IfaceOption) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.AddInterface", trace.WithAttributes(
		attribute.String("srcName", srcName),
		attribute.String("dstPrefix", dstPrefix)))
	defer span.End()

	newNs := netns.None()
	if !n.isDefault {
		var err error
		newNs, err = netns.GetFromPath(n.path)
		if err != nil {
			return fmt.Errorf("failed get network namespace %q: %v", n.path, err)
		}
		defer newNs.Close()
	}

	i, iface, err := n.createInterface(ctx, newNs, srcName, dstPrefix, dstName, options...)
	if err != nil {
		return err
	}

	// Configure the interface now this is moved in the proper namespace.
	if err := n.configureInterface(ctx, n.nlHandle, iface, i); err != nil {
		// If configuring the device fails move it back to the host namespace
		// and change the name back to the source name. This allows the caller
		// to properly cleanup the interface. Its important especially for
		// interfaces with global attributes, ex: vni id for vxlan interfaces.
		if nerr := n.nlHandle.LinkSetName(iface, i.SrcName()); nerr != nil {
			log.G(ctx).Errorf("renaming interface (%s->%s) failed, %v after config error %v", i.DstName(), i.SrcName(), nerr, err)
		}
		if nerr := n.nlHandle.LinkSetNsFd(iface, ns.ParseHandlerInt()); nerr != nil {
			log.G(ctx).Errorf("moving interface %s to host ns failed, %v, after config error %v", i.SrcName(), nerr, err)
		}
		return err
	}

	// Up the interface, retrying a few times on failure.
	cnt := 0
	for err = n.nlHandle.LinkSetUp(iface); err != nil && cnt < 3; cnt++ {
		retryCtx, span2 := otel.Tracer("").Start(ctx, "libnetwork.osl.retryingLinkUp", trace.WithAttributes(
			attribute.String("srcName", srcName),
			attribute.String("dstPrefix", dstPrefix)))
		log.G(retryCtx).Debugf("retrying link setup because of: %v", err)
		time.Sleep(10 * time.Millisecond)
		err = n.nlHandle.LinkSetUp(iface)
		// End the retry span here rather than with defer: a deferred End in a
		// loop would not run until AddInterface returns, inflating the
		// reported duration of every retry span (one leaked-open span per
		// iteration).
		span2.End()
	}
	if err != nil {
		return fmt.Errorf("failed to set link up: %v", err)
	}
	log.G(ctx).Debug("link has been set to up")

	// Set the routes on the interface. This can only be done when the interface is up.
	if err := setInterfaceRoutes(ctx, n.nlHandle, iface, i); err != nil {
		return fmt.Errorf("error setting interface %q routes to %q: %v", iface.Attrs().Name, i.Routes(), err)
	}

	// Wait for the interface to be up and running (or a timeout).
	up, err := waitForIfUpped(ctx, newNs, iface.Attrs().Index)
	if err != nil {
		return err
	}
	// If the interface is up, send unsolicited ARP/NA messages if necessary.
	if up {
		waitForBridgePort(ctx, ns.NlHandle(), iface)
		mcastRouteOk := waitForMcastRoute(ctx, iface.Attrs().Index, i, n.nlHandle)
		if err := n.advertiseAddrs(ctx, iface.Attrs().Index, i, n.nlHandle, mcastRouteOk); err != nil {
			return fmt.Errorf("failed to advertise addresses: %w", err)
		}
	}
	return nil
}
// createInterface creates a new Interface, moves the underlying link into the
// target network namespace (if needed), and adds the interface to [Namespace.iFaces].
//
// If dstName is empty, createInterface will generate a unique suffix and
// append it to dstPrefix.
//
// It's safe to call concurrently.
func (n *Namespace) createInterface(ctx context.Context, targetNs netns.NsHandle, srcName, dstPrefix, dstName string, options ...IfaceOption) (*Interface, netlink.Link, error) {
	i, err := newInterface(n, srcName, dstPrefix, dstName, options...)
	if err != nil {
		return nil, nil, err
	}

	// It is not safe to call generateIfaceName and createInterface
	// concurrently, so the Namespace need to be locked until the interface
	// is added to n.iFaces.
	n.mu.Lock()
	defer n.mu.Unlock()

	// In the default (host) namespace the interface keeps its source name;
	// otherwise generate a destination name unless one was supplied.
	if n.isDefault {
		i.dstName = i.srcName
	} else if i.dstName == "" {
		i.dstName = n.generateIfaceName(dstPrefix)
	}

	nlhHost := ns.NlHandle()
	// If it is a bridge interface we have to create the bridge inside
	// the namespace so don't try to lookup the interface using srcName
	if i.bridge {
		if err := n.nlHandle.LinkAdd(&netlink.Bridge{
			LinkAttrs: netlink.LinkAttrs{
				Name: i.srcName,
			},
		}); err != nil {
			return nil, nil, fmt.Errorf("failed to create bridge %q: %v", i.srcName, err)
		}
	} else if !i.createdInContainer {
		// Find the network interface identified by the SrcName attribute,
		// using the host handle because the link still lives there.
		iface, err := nlhHost.LinkByName(i.srcName)
		if err != nil {
			return nil, nil, fmt.Errorf("failed to get link by name %q: %v", i.srcName, err)
		}
		// Move the network interface to the destination
		// namespace only if the namespace is not a default
		// type
		if !n.isDefault {
			if err := moveLink(ctx, nlhHost, iface, i, targetNs); err != nil {
				return nil, nil, err
			}
		}
	}

	// Find the network interface identified by the SrcName attribute. This
	// second lookup uses the namespace-local handle: by this point the link
	// exists in the target namespace (created there, moved there, or already
	// present when createdInContainer is set).
	iface, err := n.nlHandle.LinkByName(i.srcName)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to get link by name %q: %v", i.srcName, err)
	}

	// Down the interface before configuring
	if err := n.nlHandle.LinkSetDown(iface); err != nil {
		return nil, nil, fmt.Errorf("failed to set link down: %v", err)
	}
	if err := setInterfaceName(ctx, n.nlHandle, iface, i); err != nil {
		return nil, nil, fmt.Errorf("error renaming interface %q to %q: %w", iface.Attrs().Name, i.DstName(), err)
	}
	n.iFaces = append(n.iFaces, i)
	return i, iface, nil
}
// generateIfaceName returns prefix followed by the smallest non-negative
// integer suffix that is not already used by an interface name in this
// namespace. Callers must hold n.mu (see createInterface).
func (n *Namespace) generateIfaceName(prefix string) string {
	var taken []int
	for _, iface := range n.iFaces {
		suffix, ok := strings.CutPrefix(iface.DstName(), prefix)
		if !ok {
			continue
		}
		// Ignore non-numerical prefixes and negative suffixes (they're
		// treated as a different prefix).
		if v, err := strconv.Atoi(suffix); err == nil && v >= 0 && suffix != "-0" {
			taken = append(taken, v)
		}
	}
	sort.Ints(taken)

	// There are gaps in the numbering; find the first unused number.
	//
	// An alternative implementation could be to look at the highest suffix,
	// and increment it. But, if that incremented number makes the interface
	// name overflow the IFNAMSIZ limit (= 16 chars), the kernel would reject
	// that interface name while there are other unused numbers. So, instead
	// use the lowest suffix available.
	for idx, v := range taken {
		if idx != v {
			return prefix + strconv.Itoa(idx)
		}
	}
	return prefix + strconv.Itoa(len(taken))
}
// waitForIfUpped subscribes to link updates in namespace ns and waits up to
// five seconds for the interface with index ifIndex to report IFF_UP. It
// returns (true, nil) when the interface comes up, and (false, nil) on
// timeout or when the netlink subscription's read fails (both are logged,
// not treated as hard errors).
func waitForIfUpped(ctx context.Context, ns netns.NsHandle, ifIndex int) (bool, error) {
	// context.WithoutCancel: the wait should run to completion (or time out
	// on its own) even if the caller's context is cancelled mid-way.
	ctx, span := otel.Tracer("").Start(context.WithoutCancel(ctx), "libnetwork.osl.waitforIfUpped")
	defer span.End()

	update := make(chan netlink.LinkUpdate, 100)
	upped := make(chan struct{})
	opts := netlink.LinkSubscribeOptions{
		ListExisting: true, // in case the link is already up
		ErrorCallback: func(err error) {
			select {
			case <-upped:
				// Ignore errors sent after the upped channel is closed, the netlink
				// package sends an EAGAIN after it closes its netlink socket when it
				// sees this channel is closed. (No message is ever sent on upped.)
				return
			default:
			}
			log.G(ctx).WithFields(log.Fields{
				"ifi":   ifIndex,
				"error": err,
			}).Info("netlink error while waiting for interface up")
		},
	}
	if ns.IsOpen() {
		opts.Namespace = &ns
	}
	if err := nlwrap.LinkSubscribeWithOptions(update, upped, opts); err != nil {
		return false, fmt.Errorf("failed to subscribe to link updates: %w", err)
	}
	// When done (interface upped, or timeout), stop the LinkSubscribe and drain
	// the result channel. If the result channel isn't closed after a timeout,
	// log a warning to note the goroutine leak.
	//
	// NOTE(review): after the drain timeout fires and warns once, this loop
	// keeps blocking on the update channel until it is closed — confirm the
	// netlink package always closes it after seeing upped closed.
	defer func() {
		close(upped)
		drainTimerC := time.After(3 * time.Second)
		for {
			select {
			case _, ok := <-update:
				if !ok {
					return
				}
			case <-drainTimerC:
				log.G(ctx).Warn("timeout while waiting for LinkSubscribe to terminate")
			}
		}
	}()

	timerC := time.After(5 * time.Second)
	for {
		select {
		case <-timerC:
			log.G(ctx).Warnf("timeout in waitForIfUpped")
			return false, nil
		case u, ok := <-update:
			if !ok {
				// The netlink package failed to read from its netlink socket. It will
				// already have called the ErrorCallback, so the issue has been logged.
				return false, nil
			}
			// Updates arrive for every link in the namespace; only the
			// requested index is of interest.
			if u.Attrs().Index != ifIndex {
				continue
			}
			log.G(ctx).WithFields(log.Fields{
				"iface": u.Attrs().Name,
				"ifi":   u.Attrs().Index,
				"flags": deviceFlags(u.Flags),
			}).Debug("link update")
			if u.Flags&unix.IFF_UP == unix.IFF_UP {
				return true, nil
			}
		}
	}
}
// waitForBridgePort checks whether link iface is a veth. If it is, and the other
// end of the veth is slaved to a bridge, waits for up to maxWait for the bridge
// port's state to be "forwarding". If STP is enabled on the bridge, it doesn't
// wait. If the port is still not forwarding when this returns, at-least the
// first unsolicited ARP/NA packets may be dropped.
func waitForBridgePort(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link) {
	if iface.Type() != "veth" {
		return
	}
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.waitForBridgePort")
	defer span.End()
	ctx = log.WithLogger(ctx, log.G(ctx).WithField("veth", iface.Attrs().Name))

	// The parent of a veth is the other end of the veth.
	parentIndex := iface.Attrs().ParentIndex
	if parentIndex <= 0 {
		log.G(ctx).Debug("veth has no parent index")
		return
	}
	parentIface, err := nlh.LinkByIndex(parentIndex)
	if err != nil {
		// The parent isn't in the host's netns, it's probably in a swarm load-balancer
		// sandbox, and we don't know where that is. But, swarm still uses IP-based MAC
		// addresses so the unsolicited ARPs aren't essential. If the first one goes
		// missing because the bridge's port isn't forwarding yet, it's ok.
		log.G(ctx).WithFields(log.Fields{"parentIndex": parentIndex, "error": err}).Debug("No parent interface")
		return
	}
	// If the other end of the veth has a MasterIndex, that's a bridge.
	if parentIface.Attrs().MasterIndex <= 0 {
		log.G(ctx).Debug("veth is not connected to a bridge")
		return
	}
	bridgeIface, err := nlh.LinkByIndex(parentIface.Attrs().MasterIndex)
	if err != nil {
		log.G(ctx).WithFields(log.Fields{
			"parentIndex": parentIndex,
			"masterIndex": parentIface.Attrs().MasterIndex,
			"error":       err,
		}).Warn("No parent bridge link")
		return
	}

	// Ideally, we'd read the port state via netlink. But, vishvananda/netlink needs a
	// patch to include state in its response.
	// - type Protinfo needs a "State uint8"
	// - parseProtinfo() needs "case nl.IFLA_BRPORT_STATE: pi.State = uint8(info.Value[0])"
	/*
		pi, err := nlh.LinkGetProtinfo(parentIface)
		if err != nil {
			return fmt.Errorf("get bridge protinfo: %w", err)
		}
	*/

	// Check that STP is not enabled on the bridge. It won't be enabled on a
	// bridge network's own bridge. But, could be on a user-supplied bridge
	// and, if it is, it won't be forwarding within the timeout here.
	if stpEnabled(ctx, bridgeIface.Attrs().Name) {
		log.G(ctx).Info("STP is enabled, not waiting for port to be forwarding")
		return
	}

	// Read the port state from "/sys/class/net/<bridge>/brif/<veth>/state".
	var portStateFile *os.File
	path := filepath.Join("/sys/class/net", bridgeIface.Attrs().Name, "brif", parentIface.Attrs().Name, "state")
	portStateFile, err = os.Open(path)
	if err != nil {
		// In integration tests where the daemon is running in its own netns, the bridge
		// device isn't visible in "/sys/class/net". So, just wait for hopefully-long-enough
		// for the bridge's port to be ready.
		log.G(ctx).WithField("port", path).Warn("Failed to open port state file, waiting for 20ms")
		time.Sleep(20 * time.Millisecond)
		return
	}
	defer portStateFile.Close()

	// Poll the bridge port's state until it's "forwarding". (By now, it should be. So, poll
	// quickly, and not for long.)
	const pollInterval = 10 * time.Millisecond
	const maxWait = 200 * time.Millisecond
	var stateFileContent [2]byte
	for range int64(maxWait / pollInterval) {
		// ReadAt with offset 0 re-reads the current state each iteration
		// without reopening the file.
		n, err := portStateFile.ReadAt(stateFileContent[:], 0)
		if err != nil {
			log.G(ctx).WithFields(log.Fields{
				"filename": path,
				"error":    err,
			}).Warn("Failed to read bridge port state")
			return
		}
		if n == 0 {
			log.G(ctx).WithField("filename", path).Warn("Empty bridge port state file")
			return
		}
		// Forwarding is state '3'.
		// https://elixir.bootlin.com/linux/v6.13/source/include/uapi/linux/if_bridge.h#L49-L53
		if stateFileContent[0] != '3' {
			log.G(ctx).WithField("portState", stateFileContent[0]).Debug("waiting for bridge port to be forwarding")
			time.Sleep(pollInterval)
			continue
		}
		log.G(ctx).Debug("Bridge port is forwarding")
		return
	}
	log.G(ctx).WithFields(log.Fields{
		"portState": stateFileContent[0],
		"waitTime":  maxWait,
	}).Warn("Bridge port not forwarding")
}
// stpEnabled returns true if "/sys/class/net/<name>/bridge/stp_state" can be read
// and does not contain "0". A read failure is logged and reported as disabled.
func stpEnabled(ctx context.Context, name string) bool {
	fname := filepath.Join("/sys/class/net", name, "bridge/stp_state")
	content, err := os.ReadFile(fname)
	if err != nil {
		log.G(ctx).WithError(err).Warnf("Failed to read stp_state file %q", fname)
		return false
	}
	if len(content) == 0 {
		return false
	}
	return content[0] != '0'
}
// waitForMcastRoute waits for an interface to have a route from ::1 to the IPv6 LL all-nodes
// address (ff02::1), if that route is needed to send a neighbour advertisement for an IPv6
// interface address.
//
// After waiting, or a failure, if there is no route - no error is returned. The NA send may
// fail, but try it anyway.
//
// In CI, the NA send failed with "write ip ::1->ff02::1: sendmsg: network is unreachable".
// That error has not been seen since addition of the check that the veth's parent bridge port
// is forwarding, so that may have been the issue. But, in case it's a timing problem that's
// only less-likely because of delay caused by that check, make sure the route exists.
func waitForMcastRoute(ctx context.Context, ifIndex int, i *Interface, nlh nlwrap.Handle) bool {
	// The route is only needed when an NA will actually be sent.
	if i.addressIPv6 == nil || i.advertiseAddrNMsgs == 0 {
		return true
	}
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.waitForMcastRoute")
	defer span.End()

	const pollInterval = 10 * time.Millisecond
	const maxWait = 200 * time.Millisecond
	for range int64(maxWait / pollInterval) {
		routes, err := nlh.RouteGetWithOptions(net.IPv6linklocalallnodes, &netlink.RouteGetOptions{
			IifIndex: ifIndex,
			SrcAddr:  net.IPv6loopback,
		})
		if errors.Is(err, unix.EMSGSIZE) {
			// FIXME(robmry) - if EMSGSIZE is returned (why?), it seems to be persistent.
			// So, skip the delay and continue to the NA send as it seems to succeed.
			log.G(ctx).Info("Skipping check for route to send NA, EMSGSIZE")
			return true
		}
		if err != nil || len(routes) == 0 {
			log.G(ctx).WithFields(log.Fields{"error": err, "nroutes": len(routes)}).Info("Waiting for route to send NA")
			time.Sleep(pollInterval)
			continue
		}
		return true
	}
	// Fix: the structured-log field previously had an empty key (""); name it
	// "waitTime" to match the equivalent warning in waitForBridgePort.
	log.G(ctx).WithField("waitTime", maxWait).Warn("No route for neighbour advertisement")
	return false
}
// advertiseAddrs triggers send unsolicited ARP and Neighbour Advertisement
// messages, so that caches are updated with the MAC address currently associated
// with the interface's IP addresses.
//
// IP addresses are recycled quickly when endpoints are dropped on network
// disconnect or container stop. A new MAC address may have been generated, so
// this is necessary to avoid packets sent to the old MAC address getting dropped
// until the ARP/Neighbour cache entries expire.
//
// Note that the kernel's arp_notify sysctl setting is not respected.
func (n *Namespace) advertiseAddrs(ctx context.Context, ifIndex int, i *Interface, nlh nlwrap.Handle, mcastRouteOk bool) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.advertiseAddrs.initial")
	defer span.End()

	mac := i.MacAddress()
	address4 := i.Address()
	address6 := i.AddressIPv6()
	ctx = log.WithLogger(ctx, log.G(ctx).WithFields(log.Fields{
		"iface":        i.dstName,
		"ifi":          ifIndex,
		"mac":          mac.String(),
		"ip4":          address4,
		"ip6":          address6,
		"mcastRouteOk": mcastRouteOk,
	}))
	if address4 == nil && address6 == nil {
		// Nothing to do - for example, a bridge with no configured addresses.
		log.G(ctx).Debug("No IP addresses to advertise")
		return nil
	}
	if mac == nil {
		// Nothing to do - for example, a layer-3 ipvlan.
		log.G(ctx).Debug("No MAC address to advertise")
		return nil
	}
	if i.advertiseAddrNMsgs == 0 {
		log.G(ctx).Debug("Unsolicited ARP/NA is disabled")
		return nil
	}

	arpSender, naSender := n.prepAdvertiseAddrs(ctx, i, ifIndex)
	if arpSender == nil && naSender == nil {
		return nil
	}
	cleanup := func() {
		if arpSender != nil {
			arpSender.Close()
		}
		if naSender != nil {
			naSender.Close()
		}
	}
	// stillSending is set once the background resender goroutine takes
	// ownership of the senders; until then this function closes them.
	stillSending := false
	defer func() {
		if !stillSending {
			cleanup()
		}
	}()

	// send re-reads the link's current MAC before each send; advertising a
	// stale MAC would poison neighbour caches, so a change aborts the run.
	send := func(ctx context.Context) error {
		link, err := nlh.LinkByIndex(ifIndex)
		if err != nil {
			return fmt.Errorf("failed to refresh link attributes: %w", err)
		}
		if curMAC := link.Attrs().HardwareAddr; !bytes.Equal(curMAC, mac) {
			log.G(ctx).WithFields(log.Fields{"newMAC": curMAC.String()}).Warn("MAC address changed")
			return fmt.Errorf("MAC address changed, got %s, expected %s", curMAC, mac.String())
		}
		log.G(ctx).Debug("Sending unsolicited ARP/NA")
		var errs []error
		if arpSender != nil {
			if err := arpSender.Send(); err != nil {
				log.G(ctx).WithError(err).Warn("Failed to send unsolicited ARP")
				errs = append(errs, err)
			}
		}
		if naSender != nil {
			if err := naSender.Send(); err != nil {
				log.G(ctx).WithError(err).Warn("Failed to send unsolicited NA")
				// If there was no multicast route and the network is unreachable, ignore the
				// error - this happens when a macvlan's parent interface is down.
				if mcastRouteOk || !errors.Is(err, unix.ENETUNREACH) {
					errs = append(errs, err)
				}
			}
		}
		return errors.Join(errs...)
	}

	// Send an initial message. If it fails, skip the resends.
	if err := send(ctx); err != nil {
		return err
	}
	if i.advertiseAddrNMsgs == 1 {
		return nil
	}
	// Don't clean up on return from this function, there are more ARPs/NAs to send.
	stillSending = true
	// Send the rest in the background.
	go func() {
		defer cleanup()
		// Detach from the caller's span and cancellation, but keep a link to
		// the originating trace for correlation.
		ctx, span := otel.Tracer("").Start(trace.ContextWithSpanContext(context.WithoutCancel(ctx), trace.SpanContext{}),
			"libnetwork.osl.advertiseAddrs.subsequent",
			trace.WithLinks(trace.LinkFromContext(ctx)))
		defer span.End()
		ticker := time.NewTicker(i.advertiseAddrInterval)
		defer ticker.Stop()
		for c := range i.advertiseAddrNMsgs - 1 {
			select {
			case <-i.stopCh:
				// The interface is being removed (RemoveInterface closes stopCh).
				log.G(ctx).Debug("Unsolicited ARP/NA sends cancelled")
				return
			case <-ticker.C:
				if send(log.WithLogger(ctx, log.G(ctx).WithFields(log.Fields{"n": c + 1}))) != nil {
					return
				}
			}
		}
	}()
	return nil
}
// prepAdvertiseAddrs builds unsolicited-ARP and unsolicited-NA senders for
// the interface's IPv4/IPv6 addresses, running the preparation inside the
// namespace via InvokeFunc. Failures are logged; a nil sender is returned
// for anything that could not be prepared.
func (n *Namespace) prepAdvertiseAddrs(ctx context.Context, i *Interface, ifIndex int) (*l2disco.UnsolARP, *l2disco.UnsolNA) {
	var arp *l2disco.UnsolARP
	var na *l2disco.UnsolNA
	err := n.InvokeFunc(func() {
		if v4 := i.Address(); v4 != nil {
			var aerr error
			if arp, aerr = l2disco.NewUnsolARP(ctx, v4.IP, i.MacAddress(), ifIndex); aerr != nil {
				log.G(ctx).WithError(aerr).Warn("Failed to prepare unsolicited ARP")
			}
		}
		if v6 := i.AddressIPv6(); v6 != nil {
			var nerr error
			if na, nerr = l2disco.NewUnsolNA(ctx, v6.IP, i.MacAddress(), ifIndex); nerr != nil {
				log.G(ctx).WithError(nerr).Warn("Failed to prepare unsolicited NA")
			}
		}
	})
	if err != nil {
		log.G(ctx).WithError(err).Warn("Failed to prepare unsolicited ARP/NA messages")
		return nil, nil
	}
	return arp, na
}
// RemoveInterface removes an interface from the namespace by renaming to
// original name and moving it out of the sandbox.
func (n *Namespace) RemoveInterface(i *Interface) error {
	// Stop the background unsolicited-ARP/NA resender, if one is running
	// (advertiseAddrs selects on this channel).
	close(i.stopCh)

	// Find the network interface identified by the DstName attribute.
	iface, err := n.nlHandle.LinkByName(i.DstName())
	if err != nil {
		return err
	}
	// Down the interface before configuring
	if err := n.nlHandle.LinkSetDown(iface); err != nil {
		return err
	}

	// TODO(aker): Why are we doing this? This would fail if the initial interface set up failed before the "dest interface" was moved into its own namespace; see https://github.com/moby/moby/pull/46315/commits/108595c2fe852a5264b78e96f9e63cda284990a6#r1331253578
	err = n.nlHandle.LinkSetName(iface, i.SrcName())
	if err != nil {
		log.G(context.TODO()).Debugf("LinkSetName failed for interface %s: %v", i.SrcName(), err)
		return err
	}

	// if it is a bridge just delete it.
	if i.Bridge() {
		if err := n.nlHandle.LinkDel(iface); err != nil {
			return fmt.Errorf("failed deleting bridge %q: %v", i.SrcName(), err)
		}
	} else if !n.isDefault {
		// Move the network interface to caller namespace.
		// TODO(aker): What's this really doing? There are no calls to LinkDel in this package: is this code really used? (Interface.Remove() has 3 callers); see https://github.com/moby/moby/pull/46315/commits/108595c2fe852a5264b78e96f9e63cda284990a6#r1331265335
		if err := n.nlHandle.LinkSetNsFd(iface, ns.ParseHandlerInt()); err != nil {
			log.G(context.TODO()).Debugf("LinkSetNsFd failed for interface %s: %v", i.SrcName(), err)
			return err
		}
	}

	// Only drop the bookkeeping entry once all netlink operations succeeded.
	n.mu.Lock()
	n.removeInterface(i)
	n.mu.Unlock()
	return nil
}
// removeInterface deletes i from n.iFaces by identity. Callers must hold
// n.mu (see RemoveInterface).
func (n *Namespace) removeInterface(i *Interface) {
	n.iFaces = slices.DeleteFunc(n.iFaces, func(iface *Interface) bool {
		return iface == i
	})
}
// configureInterface applies the interface's MAC, IPv4/IPv6 addresses,
// master, link-local addresses, and sysctls to the link, in that order.
func (n *Namespace) configureInterface(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link, i *Interface) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.configureInterface", trace.WithAttributes(
		attribute.String("ifaceName", iface.Attrs().Name)))
	defer span.End()

	ifaceName := iface.Attrs().Name
	if err := setInterfaceMAC(ctx, nlh, iface, i); err != nil {
		return fmt.Errorf("error setting interface %q MAC to %q: %v", ifaceName, i.MacAddress(), err)
	}
	if err := setInterfaceIP(ctx, nlh, iface, i); err != nil {
		return fmt.Errorf("error setting interface %q IP to %v: %v", ifaceName, i.Address(), err)
	}
	if err := setInterfaceIPv6(ctx, nlh, iface, i); err != nil {
		return fmt.Errorf("error setting interface %q IPv6 to %v: %v", ifaceName, i.AddressIPv6(), err)
	}
	if err := setInterfaceMaster(ctx, nlh, iface, i); err != nil {
		return fmt.Errorf("error setting interface %q master to %q: %v", ifaceName, i.DstMaster(), err)
	}
	if err := setInterfaceLinkLocalIPs(ctx, nlh, iface, i); err != nil {
		return fmt.Errorf("error setting interface %q link local IPs to %v: %v", ifaceName, i.LinkLocalAddresses(), err)
	}
	return n.setSysctls(ctx, i.dstName, i.sysctls)
}
// setInterfaceMaster enslaves the link to its destination master bridge.
// It's a no-op when the interface has no master.
func setInterfaceMaster(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link, i *Interface) error {
	master := i.DstMaster()
	if master == "" {
		return nil
	}
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.setInterfaceMaster", trace.WithAttributes(
		attribute.String("i.SrcName", i.SrcName()),
		attribute.String("i.DstName", i.DstName())))
	defer span.End()

	br := &netlink.Bridge{LinkAttrs: netlink.LinkAttrs{Name: master}}
	return nlh.LinkSetMaster(iface, br)
}
// setInterfaceMAC programs the interface's configured MAC address onto the
// link. It's a no-op when no MAC is configured.
func setInterfaceMAC(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link, i *Interface) error {
	mac := i.MacAddress()
	if mac == nil {
		return nil
	}
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.setInterfaceMAC", trace.WithAttributes(
		attribute.String("i.SrcName", i.SrcName()),
		attribute.String("i.DstName", i.DstName())))
	defer span.End()

	return nlh.LinkSetHardwareAddr(iface, mac)
}
// setInterfaceIP assigns the interface's IPv4 address to the link, after
// checking it doesn't conflict with an existing route. It's a no-op when no
// IPv4 address is configured.
func setInterfaceIP(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link, i *Interface) error {
	addr := i.Address()
	if addr == nil {
		return nil
	}
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.setInterfaceIP", trace.WithAttributes(
		attribute.String("i.SrcName", i.SrcName()),
		attribute.String("i.DstName", i.DstName())))
	defer span.End()

	if err := checkRouteConflict(nlh, addr, netlink.FAMILY_V4); err != nil {
		return err
	}
	return nlh.AddrAdd(iface, &netlink.Addr{IPNet: addr, Label: ""})
}
// setInterfaceIPv6 enables or disables IPv6 on the link to match whether an
// IPv6 address is configured, and assigns that address when there is one.
func setInterfaceIPv6(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link, i *Interface) error {
	addr := i.AddressIPv6()
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.setInterfaceIPv6", trace.WithAttributes(
		attribute.String("i.SrcName", i.SrcName()),
		attribute.String("i.DstName", i.DstName()),
		attribute.String("i.AddressIPv6", addr.String())))
	defer span.End()

	// IPv6 must be enabled on the interface if and only if the network is
	// IPv6-enabled. For an interface on an IPv4-only network, if IPv6 isn't
	// disabled, the interface will be put into IPv6 multicast groups making
	// it unexpectedly susceptible to NDP cache poisoning, route injection, etc.
	// (At present, there will always be a pre-configured IPv6 address if the
	// network is IPv6-enabled.)
	if err := setIPv6(i.ns.path, i.DstName(), addr != nil); err != nil {
		return fmt.Errorf("failed to configure ipv6: %v", err)
	}
	if addr == nil {
		return nil
	}
	if err := checkRouteConflict(nlh, addr, netlink.FAMILY_V6); err != nil {
		return err
	}
	// IFA_F_NODAD: skip duplicate address detection for this address.
	return nlh.AddrAdd(iface, &netlink.Addr{IPNet: addr, Label: "", Flags: syscall.IFA_F_NODAD})
}
// setInterfaceLinkLocalIPs assigns each configured link-local address to the
// link, stopping at the first failure.
func setInterfaceLinkLocalIPs(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link, i *Interface) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.setInterfaceLinkLocalIPs", trace.WithAttributes(
		attribute.String("i.SrcName", i.SrcName()),
		attribute.String("i.DstName", i.DstName())))
	defer span.End()

	for _, ll := range i.LinkLocalAddresses() {
		if err := nlh.AddrAdd(iface, &netlink.Addr{IPNet: ll}); err != nil {
			return err
		}
	}
	return nil
}
// setSysctls applies interface-scoped sysctls inside the namespace. Each
// entry must have the form "net.X.Y.IFNAME.Z=value"; the IFNAME component is
// replaced with ifName when building the /proc/sys path. It returns an error
// on the first malformed entry or failed write.
func (n *Namespace) setSysctls(ctx context.Context, ifName string, sysctls []string) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.setSysctls", trace.WithAttributes(
		attribute.String("ifName", ifName)))
	defer span.End()

	for _, sc := range sysctls {
		k, v, found := strings.Cut(sc, "=")
		if !found {
			return fmt.Errorf("expected sysctl '%s' to have format name=value", sc)
		}
		sk := strings.Split(k, ".")
		if len(sk) != 5 {
			return fmt.Errorf("expected sysctl '%s' to have format net.X.Y.IFNAME.Z", sc)
		}
		// Substitute ifName for the fourth (IFNAME) component; sk[4:] is the
		// single trailing Z component.
		sysPath := filepath.Join(append([]string{"/proc/sys", sk[0], sk[1], sk[2], ifName}, sk[4:]...)...)
		// errF carries the closure's result out of InvokeFunc, which only
		// reports whether the function could be run in the namespace.
		var errF error
		f := func() {
			if fi, err := os.Stat(sysPath); err != nil || !fi.Mode().IsRegular() {
				errF = fmt.Errorf("%s is not a sysctl file", sysPath)
			} else if curVal, err := os.ReadFile(sysPath); err != nil {
				errF = fmt.Errorf("unable to read '%s': %w", sysPath, err)
			} else if strings.TrimSpace(string(curVal)) == v {
				// The value is already correct, don't try to write the file in case
				// "/proc/sys/net" is a read-only filesystem.
			} else if err := os.WriteFile(sysPath, []byte(v), 0o644); err != nil {
				errF = fmt.Errorf("unable to write to '%s': %w", sysPath, err)
			}
		}
		// Run inside the namespace so the write targets this netns's view of
		// /proc/sys/net.
		if err := n.InvokeFunc(f); err != nil {
			return fmt.Errorf("failed to run sysctl setter in network namespace: %w", err)
		}
		if errF != nil {
			return errF
		}
	}
	return nil
}
// setInterfaceName renames the link to the interface's destination name
// (i.DstName()). createInterface brings the link down before calling this.
func setInterfaceName(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link, i *Interface) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.setInterfaceName", trace.WithAttributes(
		attribute.String("ifaceName", iface.Attrs().Name)))
	defer span.End()

	return nlh.LinkSetName(iface, i.DstName())
}
// setInterfaceRoutes adds the interface's link-scoped routes, skipping
// default (unspecified) destinations.
func setInterfaceRoutes(ctx context.Context, nlh nlwrap.Handle, iface netlink.Link, i *Interface) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.osl.setInterfaceRoutes", trace.WithAttributes(
		attribute.String("i.SrcName", i.SrcName()),
		attribute.String("i.DstName", i.DstName())))
	defer span.End()

	linkIndex := iface.Attrs().Index
	for _, rt := range i.Routes() {
		// Don't set up a default route now, it'll be set later if this
		// interface is selected as the default gateway.
		if rt.IP.IsUnspecified() {
			continue
		}
		route := &netlink.Route{
			Scope:     netlink.SCOPE_LINK,
			LinkIndex: linkIndex,
			Dst:       rt,
		}
		if err := nlh.RouteAdd(route); err != nil {
			return err
		}
	}
	return nil
}
// checkRouteConflict returns an error if address overlaps with the
// destination of any existing non-default route in the given address family.
func checkRouteConflict(nlh nlwrap.Handle, address *net.IPNet, family int) error {
	existing, err := nlh.RouteList(nil, family)
	if err != nil {
		return err
	}
	for _, rt := range existing {
		// Ignore default routes and routes with no destination.
		if rt.Dst == nil || rt.Dst.IP.IsUnspecified() {
			continue
		}
		if rt.Dst.Contains(address.IP) || address.Contains(rt.Dst.IP) {
			return fmt.Errorf("cannot program address %v in sandbox interface because it conflicts with existing route %s",
				address, rt)
		}
	}
	return nil
}
package kernel
// conditionalCheck decides whether a kernel parameter transition from its
// current value (val1) to a proposed value (val2) should be applied.
type conditionalCheck func(val1, val2 string) bool

// OSValue represents a tuple, value defined, check function when to apply the value
type OSValue struct {
	Value   string
	CheckFn conditionalCheck
}

// propertyIsValid reports whether the value transition from val1 to val2 is
// permitted: a nil check always permits it, otherwise the check decides.
func propertyIsValid(val1, val2 string, check conditionalCheck) bool {
	// Idiomatic form of the former "if cond { return true }; return false".
	return check == nil || check(val1, val2)
}
package kernel
import (
"context"
"os"
"path"
"strings"
"github.com/containerd/log"
)
// writeSystemProperty writes the value to a path under /proc/sys as determined from the key.
// For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward.
func writeSystemProperty(key, value string) error {
	target := path.Join("/proc/sys", strings.ReplaceAll(key, ".", "/"))
	return os.WriteFile(target, []byte(value), 0o644)
}
// readSystemProperty reads the value from the path under /proc/sys derived
// from the dotted key and returns it with surrounding whitespace trimmed.
func readSystemProperty(key string) (string, error) {
	raw, err := os.ReadFile(path.Join("/proc/sys", strings.ReplaceAll(key, ".", "/")))
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(string(raw)), nil
}
// ApplyOSTweaks applies the configuration values passed as arguments to the
// kernel parameters (sysctls) named by the osConfig keys. A value is written
// only when its OSValue.CheckFn (if non-nil) approves the transition from
// the current value. Read/write failures are logged and skipped; the
// function never fails.
func ApplyOSTweaks(osConfig map[string]*OSValue) {
	for k, v := range osConfig {
		// read the existing property from disk
		oldv, err := readSystemProperty(k)
		if err != nil {
			log.G(context.TODO()).WithError(err).Errorf("error reading the kernel parameter %s", k)
			continue
		}

		if propertyIsValid(oldv, v.Value, v.CheckFn) {
			// write new prop value to disk
			if err := writeSystemProperty(k, v.Value); err != nil {
				log.G(context.TODO()).WithError(err).Errorf("error setting the kernel parameter %s = %s, (leaving as %s)", k, v.Value, oldv)
				continue
			}
			log.G(context.TODO()).Debugf("updated kernel parameter %s = %s (was %s)", k, v.Value, oldv)
		}
	}
}
package osl
import (
"context"
"errors"
"fmt"
"net"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"syscall"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/docker/docker/daemon/libnetwork/osl/kernel"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/internal/nlwrap"
"github.com/docker/docker/internal/unshare"
"github.com/vishvananda/netlink"
"github.com/vishvananda/netlink/nl"
"github.com/vishvananda/netns"
"golang.org/x/sys/unix"
)
// defaultPrefix is the default base directory under which the "netns"
// directory holding sandbox namespace files is created; see SetBasePath.
const defaultPrefix = "/var/run/docker"
func init() {
	// Lock main() to the initial thread to exclude the goroutines spawned
	// by func (*Namespace) InvokeFunc() or func setIPv6() below from
	// being scheduled onto that thread. Changes to the network namespace of
	// the initial thread alter /proc/self/ns/net, which would break any
	// code which (incorrectly) assumes that the file is the network
	// namespace for the thread it is currently executing on.
	//
	// The lock is intentionally never released; it must hold for the whole
	// lifetime of the process.
	runtime.LockOSThread()
}
var (
	// once guards the lazy, one-time creation of the netns base directory.
	once sync.Once
	// netnsBasePath is the directory holding sandbox namespace files;
	// it can be redirected with SetBasePath.
	netnsBasePath = filepath.Join(defaultPrefix, "netns")
)
// SetBasePath sets the base url prefix for the ns path.
//
// NOTE(review): the assignment is not synchronized with readers of
// netnsBasePath; presumably this is only called during startup — confirm
// against callers before relying on it at runtime.
func SetBasePath(path string) {
	netnsBasePath = filepath.Join(path, "netns")
}
// basePath returns the directory under which sandbox namespace files live.
func basePath() string {
	return netnsBasePath
}
// createBasePath creates the netns base directory hierarchy. It panics on
// failure (via once.Do from createNamespaceFile/NewSandbox): without this
// directory no sandbox can be created, so there is no way to continue.
func createBasePath() {
	if err := os.MkdirAll(basePath(), 0o755); err != nil {
		// Include the underlying error; the original message hid the cause
		// (e.g. permission vs. read-only filesystem), making failures hard
		// to diagnose.
		panic(fmt.Errorf("could not create net namespace path directory: %w", err))
	}
}
// GenerateKey generates a sandbox key based on the passed
// container id. The returned key is basePath()/<first 12 chars of the id>,
// or an empty string when no key can be derived.
func GenerateKey(containerID string) string {
	maxLen := 12
	// Read sandbox key from host for overlay
	if strings.HasPrefix(containerID, "-") {
		// The id is a "-<suffix>" marker: scan existing namespace files for
		// names ending in that suffix and pick the one with the highest
		// numeric prefix.
		var (
			index    int
			indexStr string
			tmpkey   string
		)
		dir, err := os.ReadDir(basePath())
		if err != nil {
			return ""
		}
		for _, v := range dir {
			id := v.Name()
			// NOTE(review): containerID[:maxLen-1] keeps the leading "-"
			// plus the first 10 suffix characters — presumably matching
			// file names of the form "<index>-<short id>"; confirm against
			// the overlay driver's callers.
			if strings.HasSuffix(id, containerID[:maxLen-1]) {
				indexStr = strings.TrimSuffix(id, containerID[:maxLen-1])
				tmpindex, err := strconv.Atoi(indexStr)
				if err != nil {
					return ""
				}
				if tmpindex > index {
					index = tmpindex
					tmpkey = id
				}
			}
		}
		containerID = tmpkey
		if containerID == "" {
			return ""
		}
	}
	// Short ids are used as-is instead of being truncated.
	if len(containerID) < maxLen {
		maxLen = len(containerID)
	}
	return basePath() + "/" + containerID[:maxLen]
}
// NewSandbox provides a new Namespace instance created in an os specific way
// provided a key which uniquely identifies the sandbox.
//
// When isRestore is true, the namespace file identified by key is assumed to
// already exist and is reused; otherwise a new network namespace is created
// (or the caller's namespace is bind-mounted, when osCreate is false).
func NewSandbox(key string, osCreate, isRestore bool) (*Namespace, error) {
	if !isRestore {
		err := createNetworkNamespace(key, osCreate)
		if err != nil {
			return nil, err
		}
	} else {
		// Still make sure the base directory exists on the restore path.
		once.Do(createBasePath)
	}
	// When osCreate is false this Namespace wraps the host (default) netns.
	n := &Namespace{path: key, isDefault: !osCreate}
	sboxNs, err := netns.GetFromPath(n.path)
	if err != nil {
		return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
	}
	defer sboxNs.Close()
	// Open a netlink socket inside the namespace for later configuration.
	n.nlHandle, err = nlwrap.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
	if err != nil {
		return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
	}
	err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
	if err != nil {
		// Non-fatal: netlink operations may just block longer than intended.
		log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
	}
	if err = n.loopbackUp(); err != nil {
		n.nlHandle.Close()
		return nil, err
	}
	return n, nil
}
// mountNetworkNamespace bind-mounts the namespace file at basePath onto
// lnPath, keeping the namespace alive independently of its creating process.
func mountNetworkNamespace(basePath string, lnPath string) error {
	if err := syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, ""); err != nil {
		return fmt.Errorf("bind-mount %s -> %s: %w", basePath, lnPath, err)
	}
	return nil
}
// GetSandboxForExternalKey returns sandbox object for the supplied path.
//
// The key file is (re)created, the namespace at basePath is bind-mounted onto
// it, a netlink handle is opened inside it, and loopback is brought up.
func GetSandboxForExternalKey(basePath string, key string) (*Namespace, error) {
	if err := createNamespaceFile(key); err != nil {
		return nil, err
	}
	if err := mountNetworkNamespace(basePath, key); err != nil {
		return nil, err
	}
	n := &Namespace{path: key}
	sboxNs, err := netns.GetFromPath(n.path)
	if err != nil {
		return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
	}
	defer sboxNs.Close()
	n.nlHandle, err = nlwrap.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
	if err != nil {
		return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
	}
	err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
	if err != nil {
		// Non-fatal: netlink operations may just block longer than intended.
		log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
	}
	if err = n.loopbackUp(); err != nil {
		n.nlHandle.Close()
		return nil, err
	}
	return n, nil
}
// createNetworkNamespace creates (osCreate=true) or captures the calling
// thread's network namespace and bind-mounts it at path so it outlives its
// creator.
func createNetworkNamespace(path string, osCreate bool) error {
	if err := createNamespaceFile(path); err != nil {
		return err
	}
	// do bind-mounts the executing thread's own netns file onto path, which
	// is why it must run on the thread whose namespace we want to capture.
	do := func() error {
		return mountNetworkNamespace(fmt.Sprintf("/proc/self/task/%d/ns/net", unix.Gettid()), path)
	}
	if osCreate {
		// Run do() on a thread placed into a freshly unshared network namespace.
		return unshare.Go(unix.CLONE_NEWNET, do, nil)
	}
	return do()
}
// unmountNamespaceFile detach-unmounts the given namespace file. Paths that
// cannot be stat'ed are skipped; EINVAL (path is not a mount point) is not
// treated as an error either.
func unmountNamespaceFile(path string) {
	if _, err := os.Stat(path); err != nil {
		// ignore when we cannot stat the path
		return
	}
	err := syscall.Unmount(path, syscall.MNT_DETACH)
	if err == nil || errors.Is(err, unix.EINVAL) {
		return
	}
	log.G(context.TODO()).WithError(err).Error("Error unmounting namespace file")
}
// createNamespaceFile (re)creates an empty file at path to serve as a
// bind-mount target for a network namespace, unmounting any previous
// namespace still mounted there.
func createNamespaceFile(path string) error {
	once.Do(createBasePath)
	// If the path is there unmount it first
	unmountNamespaceFile(path)
	f, err := os.Create(path)
	if err == nil {
		_ = f.Close() // only the file's existence matters
	}
	return err
}
// Namespace represents a network sandbox. It represents a Linux network
// namespace, and moves an interface into it when called on method AddInterface
// or sets the gateway etc. It holds a list of Interfaces, routes etc., and more
// can be added dynamically.
type Namespace struct {
	path                string                // path is the absolute path to the network namespace. It is safe to access it concurrently.
	iFaces              []*Interface          // interfaces added to the sandbox; mutated under mu (see RestoreInterfaces)
	gw                  net.IP                // default IPv4 gateway; guarded by mu
	gwv6                net.IP                // default IPv6 gateway; guarded by mu
	defRoute4SrcName    string                // srcName of the interface carrying a gateway-less IPv4 default route; guarded by mu
	defRoute6SrcName    string                // srcName of the interface carrying a gateway-less IPv6 default route; guarded by mu
	staticRoutes        []*types.StaticRoute  // additional static routes; guarded by mu
	isDefault           bool                  // isDefault is true when Namespace represents the host network namespace. It is safe to access it concurrently.
	ipv6LoEnabledOnce   sync.Once             // guards the first population of ipv6LoEnabledCached (see IPv6LoEnabled)
	ipv6LoEnabledCached bool                  // guarded by mu; refreshed by RefreshIPv6LoEnabled
	nlHandle            nlwrap.Handle         // nlHandle is the netlink handle for the network namespace. It is safe to access it concurrently.
	mu                  sync.Mutex
}
// Interfaces returns the collection of Interface previously added with the AddInterface
// method. Note that this doesn't include network interfaces added in any
// other way (such as the default loopback interface which is automatically
// created on creation of a sandbox).
func (n *Namespace) Interfaces() []*Interface {
	// n.iFaces is appended to under n.mu elsewhere (e.g. RestoreInterfaces),
	// so the snapshot must be taken under the same lock to avoid a data race.
	n.mu.Lock()
	defer n.mu.Unlock()
	ifaces := make([]*Interface, len(n.iFaces))
	copy(ifaces, n.iFaces)
	return ifaces
}
// ifaceBySrcName returns the sandbox interface whose srcName matches, or nil
// if no such interface has been added.
func (n *Namespace) ifaceBySrcName(srcName string) *Interface {
	n.mu.Lock()
	defer n.mu.Unlock()
	for _, candidate := range n.iFaces {
		if candidate.srcName == srcName {
			return candidate
		}
	}
	return nil
}
// loopbackUp brings up the "lo" interface inside the namespace.
func (n *Namespace) loopbackUp() error {
	lo, err := n.nlHandle.LinkByName("lo")
	if err != nil {
		return err
	}
	return n.nlHandle.LinkSetUp(lo)
}
// GetLoopbackIfaceName returns the name of the loopback interface.
func (n *Namespace) GetLoopbackIfaceName() string {
	const loopbackIfaceName = "lo"
	return loopbackIfaceName
}
// AddAliasIP adds the passed IP address to the named interface.
func (n *Namespace) AddAliasIP(ifName string, ip *net.IPNet) error {
	link, err := n.nlHandle.LinkByName(ifName)
	if err != nil {
		return err
	}
	return n.nlHandle.AddrAdd(link, &netlink.Addr{IPNet: ip})
}
// RemoveAliasIP removes the passed IP address from the named interface.
func (n *Namespace) RemoveAliasIP(ifName string, ip *net.IPNet) error {
	link, err := n.nlHandle.LinkByName(ifName)
	if err != nil {
		return err
	}
	return n.nlHandle.AddrDel(link, &netlink.Addr{IPNet: ip})
}
// DisableARPForVIP disables ARP replies and requests for VIP addresses
// on a particular interface.
//
// It writes arp_ignore=1 and arp_announce=2 for the interface's in-namespace
// name (dstName); see the kernel's ip-sysctl documentation for the exact
// semantics of those values. Errors raised inside the namespace callback are
// reported through the named return retErr.
func (n *Namespace) DisableARPForVIP(srcName string) (retErr error) {
	// Map the sandbox-side srcName to the name the interface has inside the
	// namespace, which is what /proc/sys/net/ipv4/conf is keyed by.
	dstName := ""
	for _, i := range n.Interfaces() {
		if i.SrcName() == srcName {
			dstName = i.DstName()
			break
		}
	}
	if dstName == "" {
		return fmt.Errorf("failed to find interface %s in sandbox", srcName)
	}
	// The sysctl files are per-namespace, so the writes must happen from
	// within the sandbox's network namespace.
	err := n.InvokeFunc(func() {
		path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore")
		if err := os.WriteFile(path, []byte{'1', '\n'}, 0o644); err != nil {
			retErr = fmt.Errorf("Failed to set %s to 1: %v", path, err)
			return
		}
		path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce")
		if err := os.WriteFile(path, []byte{'2', '\n'}, 0o644); err != nil {
			retErr = fmt.Errorf("Failed to set %s to 2: %v", path, err)
			return
		}
	})
	if err != nil {
		// InvokeFunc itself failed (could not enter the namespace).
		return err
	}
	return retErr
}
// InvokeFunc invoke a function in the network namespace.
//
// f runs on a fresh goroutine whose OS thread is switched into the sandbox's
// namespace for the duration of the call; InvokeFunc blocks until f has
// returned (signalled by closing done) or until switching namespaces fails.
func (n *Namespace) InvokeFunc(f func()) error {
	path := n.nsPath()
	newNS, err := netns.GetFromPath(path)
	if err != nil {
		return fmt.Errorf("failed get network namespace %q: %w", path, err)
	}
	defer newNS.Close()
	done := make(chan error, 1)
	go func() {
		runtime.LockOSThread()
		// InvokeFunc() could have been called from a goroutine with
		// tampered thread state, e.g. from another InvokeFunc()
		// callback. The outer goroutine's thread state cannot be
		// trusted.
		origNS, err := netns.Get()
		if err != nil {
			runtime.UnlockOSThread()
			done <- fmt.Errorf("failed to get original network namespace: %w", err)
			return
		}
		defer origNS.Close()
		if err := netns.Set(newNS); err != nil {
			runtime.UnlockOSThread()
			done <- err
			return
		}
		defer func() {
			// Closing done without having sent an error signals success.
			close(done)
			if err := netns.Set(origNS); err != nil {
				log.G(context.TODO()).WithError(err).Warn("failed to restore thread's network namespace")
				// Recover from the error by leaving this goroutine locked to
				// the thread. The runtime will terminate the thread and replace
				// it with a clean one when this goroutine returns.
			} else {
				runtime.UnlockOSThread()
			}
		}()
		f()
	}()
	return <-done
}
// nsPath returns the namespace mount path, read under the mutex.
func (n *Namespace) nsPath() string {
	n.mu.Lock()
	p := n.path
	n.mu.Unlock()
	return p
}
// Key returns the path where the network namespace is mounted.
func (n *Namespace) Key() string {
	// Read without the lock; the struct documents path as safe for
	// concurrent access.
	return n.path
}
// Destroy destroys the sandbox: it closes the netlink handle, unmounts the
// namespace, and removes the mount-point file.
func (n *Namespace) Destroy() error {
	n.nlHandle.Handle.Close()
	// Assuming no running process is executing in this network namespace,
	// unmounting is sufficient to destroy it.
	err := syscall.Unmount(n.path, syscall.MNT_DETACH)
	if err != nil {
		return err
	}
	// Remove the path where the netns was mounted
	if err := os.Remove(n.path); err != nil {
		log.G(context.TODO()).WithError(err).Error("error removing namespace file")
	}
	return nil
}
// RestoreInterfaces restores the network namespace's interfaces.
//
// For each interface, the live link list in the namespace is scanned to
// recover the dstName the interface currently has (it may have changed due to
// repeated network connect/disconnect cycles); links are matched by name
// prefix (vxlan/veth) or by their configured IPv4/IPv6 address.
func (n *Namespace) RestoreInterfaces(interfaces map[Iface][]IfaceOption) error {
	// restore interfaces
	for iface, opts := range interfaces {
		i, err := newInterface(n, iface.SrcName, iface.DstPrefix, iface.DstName, opts...)
		if err != nil {
			return err
		}
		if n.isDefault {
			// In the host namespace, interfaces keep their original names.
			// NOTE(review): in this branch the interface is not appended to
			// n.iFaces; preserved as-is — confirm this is intended.
			i.dstName = i.srcName
		} else {
			links, err := n.nlHandle.LinkList()
			if err != nil {
				return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path)
			}
			// due to the docker network connect/disconnect, so the dstName should
			// restore from the namespace
			for _, link := range links {
				ifaceName := link.Attrs().Name
				if i.dstPrefix == "vxlan" && strings.HasPrefix(ifaceName, "vxlan") {
					i.dstName = ifaceName
					break
				}
				// find the interface name by ip
				findIfname := func(needle *net.IPNet, haystack []netlink.Addr) (string, bool) {
					for _, addr := range haystack {
						if addr.IPNet.String() == needle.String() {
							return ifaceName, true
						}
					}
					return "", false
				}
				if i.address != nil {
					addresses, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4)
					if err != nil {
						return err
					}
					if name, found := findIfname(i.address, addresses); found {
						i.dstName = name
						break
					}
				}
				if i.addressIPv6 != nil {
					addresses, err := n.nlHandle.AddrList(link, netlink.FAMILY_V6)
					if err != nil {
						return err
					}
					// Fix: compare against the IPv6 address. Previously this
					// passed i.address (the IPv4 address), so an IPv6-only
					// interface could never be matched by address.
					if name, found := findIfname(i.addressIPv6, addresses); found {
						i.dstName = name
						break
					}
				}
				// This is to find the interface name of the pair in overlay sandbox
				if i.master != "" && i.dstPrefix == "veth" && strings.HasPrefix(ifaceName, "veth") {
					i.dstName = ifaceName
				}
			}
			n.mu.Lock()
			n.iFaces = append(n.iFaces, i)
			n.mu.Unlock()
		}
	}
	return nil
}
// RestoreRoutes appends the given routes to the sandbox's static routes.
func (n *Namespace) RestoreRoutes(routes []*types.StaticRoute) {
	n.mu.Lock()
	n.staticRoutes = append(n.staticRoutes, routes...)
	n.mu.Unlock()
}
// RestoreGateway restores a default gateway address, or — when gw is nil —
// records that the default route is bound directly to the interface with the
// given srcName.
func (n *Namespace) RestoreGateway(ipv4 bool, gw net.IP, srcName string) {
	n.mu.Lock()
	defer n.mu.Unlock()
	switch {
	case gw == nil && ipv4:
		// There's no gateway address, so the default route is bound to the interface.
		n.defRoute4SrcName = srcName
	case gw == nil:
		n.defRoute6SrcName = srcName
	case ipv4:
		n.gw = gw
	default:
		n.gwv6 = gw
	}
}
// IPv6LoEnabled returns true if the loopback interface had an IPv6 address when
// last checked. It's always checked on the first call, and by RefreshIPv6LoEnabled.
// ('::1' is assigned by the kernel if IPv6 is enabled.)
func (n *Namespace) IPv6LoEnabled() bool {
	n.ipv6LoEnabledOnce.Do(n.RefreshIPv6LoEnabled)
	n.mu.Lock()
	defer n.mu.Unlock()
	return n.ipv6LoEnabledCached
}
// RefreshIPv6LoEnabled refreshes the cached result returned by IPv6LoEnabled.
func (n *Namespace) RefreshIPv6LoEnabled() {
	n.mu.Lock()
	defer n.mu.Unlock()
	// If anything goes wrong, assume no-IPv6.
	n.ipv6LoEnabledCached = false
	lo, err := n.nlHandle.LinkByName("lo")
	if err != nil {
		log.G(context.TODO()).WithError(err).Warn("Unable to find 'lo' to determine IPv6 support")
		return
	}
	addrs, err := n.nlHandle.AddrList(lo, nl.FAMILY_V6)
	if err != nil {
		log.G(context.TODO()).WithError(err).Warn("Unable to get 'lo' addresses to determine IPv6 support")
		return
	}
	n.ipv6LoEnabledCached = len(addrs) > 0
}
// ApplyOSTweaks applies operating system specific knobs on the sandbox.
//
// Only load-balancer and ingress sandboxes currently receive tweaks (IPVS
// connection-table sysctls); other sandbox types are left untouched.
func (n *Namespace) ApplyOSTweaks(types []SandboxType) {
	for _, t := range types {
		switch t {
		case SandboxTypeLoadBalancer, SandboxTypeIngress:
			kernel.ApplyOSTweaks(map[string]*kernel.OSValue{
				// disables any special handling on port reuse of existing IPVS connection table entries
				// more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32
				"net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil},
				// expires connection from the IPVS connection table when the backend is not available
				// more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133
				"net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil},
				// expires persistent connections to destination servers with weights set to 0
				// more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151
				"net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil},
			})
		}
	}
}
// setIPv6 enables or disables IPv6 on the given interface inside the network
// namespace mounted at nspath, by writing '0' or '1' to
// /proc/sys/net/ipv6/conf/<iface>/disable_ipv6. The write happens from a
// goroutine whose thread is switched into the target namespace, because that
// sysctl tree is per-namespace.
func setIPv6(nspath, iface string, enable bool) error {
	errCh := make(chan error, 1)
	go func() {
		// Closing errCh without a send signals success to the receiver.
		defer close(errCh)
		namespace, err := netns.GetFromPath(nspath)
		if err != nil {
			errCh <- fmt.Errorf("failed get network namespace %q: %w", nspath, err)
			return
		}
		defer namespace.Close()
		runtime.LockOSThread()
		origNS, err := netns.Get()
		if err != nil {
			runtime.UnlockOSThread()
			errCh <- fmt.Errorf("failed to get current network namespace: %w", err)
			return
		}
		defer origNS.Close()
		if err = netns.Set(namespace); err != nil {
			runtime.UnlockOSThread()
			errCh <- fmt.Errorf("setting into container netns %q failed: %w", nspath, err)
			return
		}
		defer func() {
			if err := netns.Set(origNS); err != nil {
				log.G(context.TODO()).WithError(err).Error("libnetwork: restoring thread network namespace failed")
				// The error is only fatal for the current thread. Keep this
				// goroutine locked to the thread to make the runtime replace it
				// with a clean thread once this goroutine returns.
			} else {
				runtime.UnlockOSThread()
			}
		}()
		path := "/proc/sys/net/ipv6/conf/" + iface + "/disable_ipv6"
		value := byte('1')
		if enable {
			value = '0'
		}
		if curVal, err := os.ReadFile(path); err != nil {
			if os.IsNotExist(err) {
				// The sysctl file being absent is treated as "IPv6 disabled
				// in the kernel": best-effort, log only.
				if enable {
					log.G(context.TODO()).WithError(err).Warn("Cannot enable IPv6 on container interface. Has IPv6 been disabled in this node's kernel?")
				} else {
					log.G(context.TODO()).WithError(err).Debug("Not disabling IPv6 on container interface. Has IPv6 been disabled in this node's kernel?")
				}
				return
			}
			errCh <- err
			return
		} else if len(curVal) > 0 && curVal[0] == value {
			// Nothing to do, the setting is already correct.
			return
		}
		// DOCKER_TEST_RO_DISABLE_IPV6 forces the failure path in tests.
		if err = os.WriteFile(path, []byte{value, '\n'}, 0o644); err != nil || os.Getenv("DOCKER_TEST_RO_DISABLE_IPV6") != "" {
			logger := log.G(context.TODO()).WithFields(log.Fields{
				"error":     err,
				"interface": iface,
			})
			if enable {
				// The user asked for IPv6 on the interface, and we can't give it to them.
				// But, in line with the IsNotExist case above, just log.
				logger.Warn("Cannot enable IPv6 on container interface, continuing.")
			} else {
				logger.Error("Cannot disable IPv6 on container interface.")
				errCh <- errors.New("failed to disable IPv6 on container's interface " + iface)
			}
			return
		}
	}()
	return <-errCh
}
package osl
import (
"context"
"errors"
"fmt"
"net"
"os"
"strings"
"github.com/containerd/log"
"github.com/vishvananda/netlink"
)
// NeighborSearchError indicates that the neighbor is already present
type NeighborSearchError struct {
ip net.IP
mac net.HardwareAddr
linkName string
present bool
}
func (n NeighborSearchError) Error() string {
var b strings.Builder
b.WriteString("neighbor entry ")
if n.present {
b.WriteString("already exists ")
} else {
b.WriteString("not found ")
}
b.WriteString("for IP ")
b.WriteString(n.ip.String())
b.WriteString(", mac ")
b.WriteString(n.mac.String())
if n.linkName != "" {
b.WriteString(", link ")
b.WriteString(n.linkName)
}
return b.String()
}
// DeleteNeighbor deletes a neighbor entry from the sandbox.
//
// To delete an entry inserted by [AddNeighbor] the caller must provide the same
// parameters used to add it.
func (n *Namespace) DeleteNeighbor(dstIP net.IP, dstMac net.HardwareAddr, options ...NeighOption) error {
	nlnh, linkName, err := n.nlNeigh(dstIP, dstMac, options...)
	if err != nil {
		return err
	}
	if err := n.nlHandle.NeighDel(nlnh); err != nil {
		log.G(context.TODO()).WithFields(log.Fields{
			"ip":    dstIP,
			"mac":   dstMac,
			"ifc":   linkName,
			"error": err,
		}).Warn("error deleting neighbor entry")
		// A missing entry is surfaced as a typed search error so callers can
		// distinguish "not found" from other failures.
		if errors.Is(err, os.ErrNotExist) {
			return NeighborSearchError{dstIP, dstMac, linkName, false}
		}
		return fmt.Errorf("could not delete neighbor %+v: %w", nlnh, err)
	}
	// Delete the dynamic entry in the bridge
	if nlnh.Family > 0 {
		nlnh.Flags = netlink.NTF_MASTER
		// Best-effort: a missing dynamic entry is only logged, not returned.
		if err := n.nlHandle.NeighDel(nlnh); err != nil && !errors.Is(err, os.ErrNotExist) {
			log.G(context.TODO()).WithFields(log.Fields{
				"ip":    dstIP,
				"mac":   dstMac,
				"ifc":   linkName,
				"error": err,
			}).Warn("error deleting dynamic neighbor entry")
		}
	}
	log.G(context.TODO()).WithFields(log.Fields{
		"ip":  dstIP,
		"mac": dstMac,
		"ifc": linkName,
	}).Debug("Neighbor entry deleted")
	return nil
}
// AddNeighbor adds a neighbor entry into the sandbox. If the entry already
// exists, a NeighborSearchError (present=true) is returned.
func (n *Namespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, options ...NeighOption) error {
	nlnh, linkName, err := n.nlNeigh(dstIP, dstMac, options...)
	if err != nil {
		return err
	}
	if err := n.nlHandle.NeighAdd(nlnh); err != nil {
		// Restructured to avoid `else` after return (guard-clause idiom);
		// behavior is unchanged.
		if !errors.Is(err, os.ErrExist) {
			return fmt.Errorf("could not add neighbor entry %+v: %w", nlnh, err)
		}
		log.G(context.TODO()).WithFields(log.Fields{
			"ip":    dstIP,
			"mac":   dstMac,
			"ifc":   linkName,
			"neigh": fmt.Sprintf("%+v", nlnh),
		}).Warn("Neighbor entry already present")
		return NeighborSearchError{dstIP, dstMac, linkName, true}
	}
	log.G(context.TODO()).WithFields(log.Fields{
		"ip":  dstIP,
		"mac": dstMac,
		"ifc": linkName,
	}).Debug("Neighbor entry added")
	return nil
}
// neigh collects the optional parameters applied by NeighOption setters.
type neigh struct {
	linkName string // srcName of the link to attach the entry to (optional)
	family   int    // address family, e.g. syscall.AF_BRIDGE; 0 means unset
}
// nlNeigh builds the netlink.Neigh for the given parameters and options,
// resolving the optional link srcName to a link index inside the namespace.
// It returns the entry, the link srcName used (may be empty), and an error.
func (n *Namespace) nlNeigh(dstIP net.IP, dstMac net.HardwareAddr, options ...NeighOption) (*netlink.Neigh, string, error) {
	var nh neigh
	nh.processNeighOptions(options...)
	nlnh := &netlink.Neigh{
		IP:           dstIP,
		HardwareAddr: dstMac,
		State:        netlink.NUD_PERMANENT,
		Family:       nh.family,
	}
	if nlnh.Family > 0 {
		// A non-zero family (e.g. AF_BRIDGE) gets NTF_SELF; see the netlink
		// neighbor/FDB documentation for the flag's semantics.
		nlnh.Flags = netlink.NTF_SELF
	}
	if nh.linkName != "" {
		// Translate the sandbox-side srcName into the in-namespace dstName
		// before looking the link up.
		linkDst := n.findDst(nh.linkName, false)
		if linkDst == "" {
			return nil, nh.linkName, fmt.Errorf("could not find the interface with name %s", nh.linkName)
		}
		iface, err := n.nlHandle.LinkByName(linkDst)
		if err != nil {
			return nil, nh.linkName, fmt.Errorf("could not find interface with destination name %s: %w", linkDst, err)
		}
		nlnh.LinkIndex = iface.Attrs().Index
	}
	return nlnh, nh.linkName, nil
}
package osl
import (
"strconv"
"strings"
"golang.org/x/sys/unix"
)
// deviceFlags is the bitmask of Linux net_device flags (unix.IFF_*); its
// String method renders the mask symbolically for logging.
type deviceFlags uint32

// deviceFlagStrings maps each known single-bit flag to its symbolic name.
var deviceFlagStrings = map[deviceFlags]string{
	unix.IFF_UP:          "IFF_UP",
	unix.IFF_BROADCAST:   "IFF_BROADCAST",
	unix.IFF_DEBUG:       "IFF_DEBUG",
	unix.IFF_LOOPBACK:    "IFF_LOOPBACK",
	unix.IFF_POINTOPOINT: "IFF_POINTOPOINT",
	unix.IFF_RUNNING:     "IFF_RUNNING",
	unix.IFF_NOARP:       "IFF_NOARP",
	unix.IFF_PROMISC:     "IFF_PROMISC",
	unix.IFF_NOTRAILERS:  "IFF_NOTRAILERS",
	unix.IFF_ALLMULTI:    "IFF_ALLMULTI",
	unix.IFF_MASTER:      "IFF_MASTER",
	unix.IFF_SLAVE:       "IFF_SLAVE",
	unix.IFF_MULTICAST:   "IFF_MULTICAST",
	unix.IFF_PORTSEL:     "IFF_PORTSEL",
	unix.IFF_AUTOMEDIA:   "IFF_AUTOMEDIA",
	unix.IFF_DYNAMIC:     "IFF_DYNAMIC",
	unix.IFF_LOWER_UP:    "IFF_LOWER_UP",
	unix.IFF_DORMANT:     "IFF_DORMANT",
	unix.IFF_ECHO:        "IFF_ECHO",
}
// String renders the flag set symbolically, joining known flag names with
// " | " and appending any unrecognized bits as a single hex value.
func (d deviceFlags) String() string {
	var names []string
	var leftover uint32
	for bit := uint(0); bit < 32; bit++ {
		mask := deviceFlags(1) << bit
		if d&mask == 0 {
			continue
		}
		if name, known := deviceFlagStrings[mask]; known {
			names = append(names, name)
		} else {
			leftover |= 1 << bit
		}
	}
	if leftover != 0 {
		names = append(names, "0x"+strconv.FormatUint(uint64(leftover), 16))
	}
	return "deviceFlags(" + strings.Join(names, " | ") + ")"
}
package osl
import (
"fmt"
"net"
"time"
)
// processNeighOptions applies the given options to the neigh, skipping nils.
func (nh *neigh) processNeighOptions(options ...NeighOption) {
	for _, apply := range options {
		if apply == nil {
			continue
		}
		apply(nh)
	}
}
// WithLinkName sets the srcName of the link to use in the neighbor entry.
func WithLinkName(name string) NeighOption {
	return func(n *neigh) {
		n.linkName = name
	}
}
// WithFamily sets the address-family for the neighbor entry. e.g. [syscall.AF_BRIDGE].
func WithFamily(family int) NeighOption {
	return func(n *neigh) {
		n.family = family
	}
}
// WithIsBridge sets whether the interface is a bridge.
func WithIsBridge(isBridge bool) IfaceOption {
	return func(iface *Interface) error {
		iface.bridge = isBridge
		return nil
	}
}
// WithMaster sets the master interface (if any) for this interface. The
// master interface name should refer to the srcName of a previously added
// interface of type bridge.
func WithMaster(name string) IfaceOption {
	return func(iface *Interface) error {
		iface.master = name
		return nil
	}
}
// WithMACAddress sets the interface MAC-address.
func WithMACAddress(mac net.HardwareAddr) IfaceOption {
	return func(iface *Interface) error {
		iface.mac = mac
		return nil
	}
}
// WithIPv4Address sets the IPv4 address of the interface.
func WithIPv4Address(addr *net.IPNet) IfaceOption {
	return func(iface *Interface) error {
		iface.address = addr
		return nil
	}
}
// WithIPv6Address sets the IPv6 address of the interface.
func WithIPv6Address(addr *net.IPNet) IfaceOption {
	return func(iface *Interface) error {
		iface.addressIPv6 = addr
		return nil
	}
}
// WithLinkLocalAddresses set the link-local IP addresses of the interface.
func WithLinkLocalAddresses(list []*net.IPNet) IfaceOption {
	return func(iface *Interface) error {
		iface.llAddrs = list
		return nil
	}
}
// WithRoutes sets the interface routes.
func WithRoutes(routes []*net.IPNet) IfaceOption {
	return func(iface *Interface) error {
		iface.routes = routes
		return nil
	}
}
// WithSysctls sets the interface sysctls.
func WithSysctls(sysctls []string) IfaceOption {
	return func(iface *Interface) error {
		iface.sysctls = sysctls
		return nil
	}
}
// WithAdvertiseAddrNMsgs sets the number of unsolicited ARP/NA messages that will
// be sent to advertise a network interface's addresses.
func WithAdvertiseAddrNMsgs(nMsgs int) IfaceOption {
	return func(iface *Interface) error {
		if nMsgs < AdvertiseAddrNMsgsMin || nMsgs > AdvertiseAddrNMsgsMax {
			return fmt.Errorf("AdvertiseAddrNMsgs %d is not in the range %d to %d",
				nMsgs, AdvertiseAddrNMsgsMin, AdvertiseAddrNMsgsMax)
		}
		iface.advertiseAddrNMsgs = nMsgs
		return nil
	}
}
// WithAdvertiseAddrInterval sets the interval between unsolicited ARP/NA messages
// sent to advertise a network interface's addresses.
func WithAdvertiseAddrInterval(interval time.Duration) IfaceOption {
	return func(i *Interface) error {
		if interval < AdvertiseAddrIntervalMin || interval > AdvertiseAddrIntervalMax {
			// Fixed message: it previously said "AdvertiseAddrNMsgs" (copy-paste
			// from WithAdvertiseAddrNMsgs), printed the Duration with %d, and
			// claimed "milliseconds" although %v renders durations with units.
			return fmt.Errorf("AdvertiseAddrInterval %v is not in the range %v to %v",
				interval, AdvertiseAddrIntervalMin, AdvertiseAddrIntervalMax)
		}
		i.advertiseAddrInterval = interval
		return nil
	}
}
// WithCreatedInContainer can be used to say the network driver created the
// interface in the container's network namespace (and, therefore, it doesn't
// need to be moved into that namespace.)
func WithCreatedInContainer(cic bool) IfaceOption {
	return func(iface *Interface) error {
		iface.createdInContainer = cic
		return nil
	}
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23 && (linux || freebsd)
package osl
import (
"errors"
"fmt"
"net"
"slices"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/vishvananda/netlink"
)
// Gateway returns the IPv4 gateway for the sandbox.
func (n *Namespace) Gateway() net.IP {
	n.mu.Lock()
	gw := n.gw
	n.mu.Unlock()
	return gw
}
// GatewayIPv6 returns the IPv6 gateway for the sandbox.
func (n *Namespace) GatewayIPv6() net.IP {
	n.mu.Lock()
	gw := n.gwv6
	n.mu.Unlock()
	return gw
}
// StaticRoutes returns additional static routes for the sandbox. Note that
// directly connected routes are stored on the particular interface they
// refer to. The returned routes are deep copies.
func (n *Namespace) StaticRoutes() []*types.StaticRoute {
	n.mu.Lock()
	defer n.mu.Unlock()
	routes := make([]*types.StaticRoute, 0, len(n.staticRoutes))
	for _, route := range n.staticRoutes {
		routes = append(routes, route.GetCopy())
	}
	return routes
}
// SetGateway sets the default IPv4 gateway for the sandbox. It is a no-op
// if the given gateway is empty.
func (n *Namespace) SetGateway(gw net.IP) error {
	if len(gw) == 0 {
		return nil
	}
	if err := n.programGateway(gw, true); err != nil {
		return err
	}
	n.mu.Lock()
	defer n.mu.Unlock()
	n.gw = gw
	return nil
}
// UnsetGateway removes the previously set default IPv4 gateway in the sandbox.
// It is a no-op if no gateway was set.
func (n *Namespace) UnsetGateway() error {
	gw := n.Gateway()
	if len(gw) == 0 {
		return nil
	}
	if err := n.programGateway(gw, false); err != nil {
		return err
	}
	n.mu.Lock()
	defer n.mu.Unlock()
	n.gw = net.IP{}
	return nil
}
// programGateway adds (isAdd=true) or removes the default route through gw.
// A direct (connected, gateway-less) route to gw must already exist: it is
// used to determine the link the default route is attached to.
func (n *Namespace) programGateway(gw net.IP, isAdd bool) error {
	gwRoutes, err := n.nlHandle.RouteGet(gw)
	if err != nil {
		return fmt.Errorf("route for the gateway %s could not be found: %v", gw, err)
	}
	linkIndex := 0
	for _, r := range gwRoutes {
		if r.Gw == nil {
			linkIndex = r.LinkIndex
			break
		}
	}
	if linkIndex == 0 {
		return fmt.Errorf("direct route for the gateway %s could not be found", gw)
	}
	route := &netlink.Route{
		Scope:     netlink.SCOPE_UNIVERSE,
		LinkIndex: linkIndex,
		Gw:        gw,
	}
	if isAdd {
		return n.nlHandle.RouteAdd(route)
	}
	return n.nlHandle.RouteDel(route)
}
// programRoute programs a route to dest via next hop nh into the namespace
// routing table. The link is taken from the first route returned for nh.
func (n *Namespace) programRoute(dest *net.IPNet, nh net.IP) error {
	gwRoutes, err := n.nlHandle.RouteGet(nh)
	if err != nil {
		return fmt.Errorf("route for the next hop %s could not be found: %v", nh, err)
	}
	if len(gwRoutes) == 0 {
		// Guard the gwRoutes[0] access below: an empty result would panic.
		return fmt.Errorf("no route found for the next hop %s", nh)
	}
	return n.nlHandle.RouteAdd(&netlink.Route{
		Scope:     netlink.SCOPE_UNIVERSE,
		LinkIndex: gwRoutes[0].LinkIndex,
		Gw:        nh,
		Dst:       dest,
	})
}
// removeRoute deletes the route to dest via next hop nh from the namespace
// routing table. The link is taken from the first route returned for nh.
func (n *Namespace) removeRoute(dest *net.IPNet, nh net.IP) error {
	gwRoutes, err := n.nlHandle.RouteGet(nh)
	if err != nil {
		return fmt.Errorf("route for the next hop could not be found: %v", err)
	}
	if len(gwRoutes) == 0 {
		// Guard the gwRoutes[0] access below: an empty result would panic.
		return fmt.Errorf("no route found for the next hop %s", nh)
	}
	return n.nlHandle.RouteDel(&netlink.Route{
		Scope:     netlink.SCOPE_UNIVERSE,
		LinkIndex: gwRoutes[0].LinkIndex,
		Gw:        nh,
		Dst:       dest,
	})
}
// SetGatewayIPv6 sets the default IPv6 gateway for the sandbox. It is a no-op
// if the given gateway is empty.
func (n *Namespace) SetGatewayIPv6(gwv6 net.IP) error {
	if len(gwv6) == 0 {
		return nil
	}
	if err := n.programGateway(gwv6, true); err != nil {
		return err
	}
	n.mu.Lock()
	defer n.mu.Unlock()
	n.gwv6 = gwv6
	return nil
}
// UnsetGatewayIPv6 unsets the previously set default IPv6 gateway in the sandbox.
// It is a no-op if no gateway was set.
func (n *Namespace) UnsetGatewayIPv6() error {
	gwv6 := n.GatewayIPv6()
	if len(gwv6) == 0 {
		return nil
	}
	if err := n.programGateway(gwv6, false); err != nil {
		return err
	}
	n.mu.Lock()
	defer n.mu.Unlock()
	n.gwv6 = net.IP{}
	return nil
}
// AddStaticRoute adds a static route to the sandbox.
func (n *Namespace) AddStaticRoute(r *types.StaticRoute) error {
	if err := n.programRoute(r.Destination, r.NextHop); err != nil {
		return err
	}
	n.mu.Lock()
	defer n.mu.Unlock()
	n.staticRoutes = append(n.staticRoutes, r)
	return nil
}
// RemoveStaticRoute removes a static route from the sandbox, dropping the
// matching entry (compared by pointer identity) from the bookkeeping list.
func (n *Namespace) RemoveStaticRoute(r *types.StaticRoute) error {
	if err := n.removeRoute(r.Destination, r.NextHop); err != nil {
		return err
	}
	n.mu.Lock()
	defer n.mu.Unlock()
	for i, v := range n.staticRoutes {
		if v == r {
			// Swap-remove: overwrite with the last element, then shrink.
			last := len(n.staticRoutes) - 1
			n.staticRoutes[i] = n.staticRoutes[last]
			n.staticRoutes = n.staticRoutes[:last]
			break
		}
	}
	return nil
}
// SetDefaultRouteIPv4 sets up a connected route to 0.0.0.0 via the Interface
// with srcName, if that Interface has a route to 0.0.0.0. Otherwise, it
// returns an error.
func (n *Namespace) SetDefaultRouteIPv4(srcName string) error {
	isV4Default := func(ipNet *net.IPNet) bool {
		return ipNet.IP.IsUnspecified() && ipNet.IP.To4() != nil
	}
	if err := n.setDefaultRoute(srcName, isV4Default); err != nil {
		return fmt.Errorf("setting IPv4 default route to interface with srcName '%s': %w", srcName, err)
	}
	n.mu.Lock()
	n.defRoute4SrcName = srcName
	n.mu.Unlock()
	return nil
}
// SetDefaultRouteIPv6 sets up a connected route to [::] via the Interface
// with srcName, if that Interface has a route to [::]. Otherwise, it
// returns an error.
func (n *Namespace) SetDefaultRouteIPv6(srcName string) error {
	isV6Default := func(ipNet *net.IPNet) bool {
		return ipNet.IP.IsUnspecified() && ipNet.IP.To4() == nil
	}
	if err := n.setDefaultRoute(srcName, isV6Default); err != nil {
		return fmt.Errorf("setting IPv6 default route to interface with srcName '%s': %w", srcName, err)
	}
	n.mu.Lock()
	n.defRoute6SrcName = srcName
	n.mu.Unlock()
	return nil
}
// setDefaultRoute installs a connected (scope-link) default route through the
// interface with the given srcName, provided that interface carries a route
// matched by routeMatcher (an unspecified destination of the right family).
func (n *Namespace) setDefaultRoute(srcName string, routeMatcher func(*net.IPNet) bool) error {
	iface := n.ifaceBySrcName(srcName)
	if iface == nil {
		return errors.New("no interface")
	}
	ridx := slices.IndexFunc(iface.routes, routeMatcher)
	if ridx == -1 {
		return errors.New("no default route")
	}
	link, err := n.nlHandle.LinkByName(iface.dstName)
	if err != nil {
		// Wrap the underlying error; the original discarded it, hiding the
		// actual netlink failure from callers.
		return fmt.Errorf("no link src:%s dst:%s: %w", srcName, iface.dstName, err)
	}
	// Return RouteAdd's result directly instead of the redundant
	// `if err != nil { return err }; return nil` shape.
	return n.nlHandle.RouteAdd(&netlink.Route{
		Scope:     netlink.SCOPE_LINK,
		LinkIndex: link.Attrs().Index,
		Dst:       iface.routes[ridx],
	})
}
// UnsetDefaultRouteIPv4 unsets the previously set default IPv4 default route
// in the sandbox. It is a no-op if no gateway was set.
func (n *Namespace) UnsetDefaultRouteIPv4() error {
	n.mu.Lock()
	srcName := n.defRoute4SrcName
	n.mu.Unlock()
	isV4Default := func(ipNet *net.IPNet) bool {
		return ipNet.IP.IsUnspecified() && ipNet.IP.To4() != nil
	}
	if err := n.unsetDefaultRoute(srcName, isV4Default); err != nil {
		return fmt.Errorf("removing IPv4 default route to interface with srcName '%s': %w", srcName, err)
	}
	n.mu.Lock()
	n.defRoute4SrcName = ""
	n.mu.Unlock()
	return nil
}
// UnsetDefaultRouteIPv6 unsets the previously set default IPv6 default route
// in the sandbox. It is a no-op if no gateway was set.
func (n *Namespace) UnsetDefaultRouteIPv6() error {
	n.mu.Lock()
	srcName := n.defRoute6SrcName
	n.mu.Unlock()
	isV6Default := func(ipNet *net.IPNet) bool {
		return ipNet.IP.IsUnspecified() && ipNet.IP.To4() == nil
	}
	if err := n.unsetDefaultRoute(srcName, isV6Default); err != nil {
		return fmt.Errorf("removing IPv6 default route to interface with srcName '%s': %w", srcName, err)
	}
	n.mu.Lock()
	n.defRoute6SrcName = ""
	n.mu.Unlock()
	return nil
}
// unsetDefaultRoute removes the link-scoped default route (selected by
// routeMatcher) via the interface with sandbox-side name srcName. It is a
// no-op when srcName is empty or the interface no longer exists.
func (n *Namespace) unsetDefaultRoute(srcName string, routeMatcher func(*net.IPNet) bool) error {
	if srcName == "" {
		return nil
	}
	iface := n.ifaceBySrcName(srcName)
	if iface == nil {
		// The interface is gone; nothing to remove.
		return nil
	}
	ridx := slices.IndexFunc(iface.routes, routeMatcher)
	if ridx == -1 {
		return errors.New("no default route")
	}
	link, err := n.nlHandle.LinkByName(iface.dstName)
	if err != nil {
		// Wrap the netlink error instead of discarding it, so callers can
		// see why the link lookup failed.
		return fmt.Errorf("no link: %w", err)
	}
	return n.nlHandle.RouteDel(&netlink.Route{
		Scope:     netlink.SCOPE_LINK,
		LinkIndex: link.Attrs().Index,
		Dst:       iface.routes[ridx],
	})
}
package portallocator
import (
"context"
"fmt"
"net"
"net/netip"
"os"
"syscall"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/ishidawataru/sctp"
)
// OSAllocator reserves ports both logically (via the in-process
// PortAllocator) and physically, by actually binding them, so the OS
// confirms the port is free before it is handed out.
type OSAllocator struct {
	// allocator is used to logically reserve ports, to avoid those we know
	// are already in use. This is useful to ensure callers don't burn their
	// retry budget unnecessarily.
	allocator *PortAllocator
}
// NewOSAllocator returns an OSAllocator backed by the process-wide
// PortAllocator singleton.
func NewOSAllocator() OSAllocator {
	return OSAllocator{allocator: Get()}
}
// RequestPortsInRange reserves a port available in the range [portStart, portEnd]
// for all the specified addrs, and then try to bind those addresses to allocate
// the port from the OS. It returns the allocated port, and all the sockets
// bound, or an error if the reserved port isn't available. Callers must take
// care of closing the returned sockets.
//
// Due to the semantic of SO_REUSEADDR, the OSAllocator can't fully determine
// if a port is free when binding 0.0.0.0 or ::. If another socket is binding
// the same port, but it's not listening to it yet, the bind will succeed but a
// subsequent listen might fail. For this reason, RequestPortsInRange doesn't
// retry on failure — it's caller's responsibility.
//
// It's safe for concurrent use.
func (pa OSAllocator) RequestPortsInRange(addrs []net.IP, proto types.Protocol, portStart, portEnd int) (_ int, _ []*os.File, retErr error) {
	port, err := pa.allocator.RequestPortsInRange(addrs, proto.String(), portStart, portEnd)
	if err != nil {
		return 0, nil, err
	}
	// Release the logical reservation on any error path below.
	defer func() {
		if retErr != nil {
			for _, addr := range addrs {
				pa.allocator.ReleasePort(addr, proto.String(), port)
			}
		}
	}()

	var boundSocks []*os.File
	// Close any sockets already bound on any error path below.
	defer func() {
		if retErr != nil {
			for i, sock := range boundSocks {
				if err := sock.Close(); err != nil {
					log.G(context.TODO()).WithFields(log.Fields{
						"addr": addrs[i],
						"port": port,
					}).WithError(err).Warnf("failed to close socket during port allocation")
				}
			}
		}
	}()

	for _, ip := range addrs {
		addr, ok := netip.AddrFromSlice(ip)
		if !ok {
			// Previously the ok flag was ignored, which would silently turn
			// an invalid IP into the zero Addr and produce a confusing bind
			// error later on.
			return 0, nil, fmt.Errorf("invalid IP address: %s", ip)
		}
		addrPort := netip.AddrPortFrom(addr.Unmap(), uint16(port))

		var sock *os.File
		switch proto {
		case types.TCP:
			sock, err = bindTCPOrUDP(addrPort, syscall.SOCK_STREAM, syscall.IPPROTO_TCP)
		case types.UDP:
			sock, err = bindTCPOrUDP(addrPort, syscall.SOCK_DGRAM, syscall.IPPROTO_UDP)
		case types.SCTP:
			sock, err = bindSCTP(addrPort)
		default:
			return 0, nil, fmt.Errorf("protocol %s not supported", proto)
		}
		if err != nil {
			return 0, nil, err
		}
		boundSocks = append(boundSocks, sock)
	}

	return port, boundSocks, nil
}
// ReleasePorts releases a common port reserved for a list of addrs. It doesn't
// close the sockets bound by [RequestPortsInRange]. This must be taken care of
// independently by the caller.
func (pa OSAllocator) ReleasePorts(addrs []net.IP, proto types.Protocol, port int) {
	protoName := proto.String()
	for _, addr := range addrs {
		pa.allocator.ReleasePort(addr, protoName, port)
	}
}
// bindTCPOrUDP creates, configures and binds a TCP or UDP socket for addr.
// typ is the socket type (syscall.SOCK_STREAM or syscall.SOCK_DGRAM); proto
// is used as the raw IP protocol number for socket(2) (callers pass
// syscall.IPPROTO_TCP / syscall.IPPROTO_UDP). The socket is bound but not
// listening; the caller owns the returned *os.File.
func bindTCPOrUDP(addr netip.AddrPort, typ int, proto types.Protocol) (_ *os.File, retErr error) {
	var domain int
	var sa syscall.Sockaddr
	// Pick the address family from the (unmapped) address.
	if addr.Addr().Unmap().Is4() {
		domain = syscall.AF_INET
		sa = &syscall.SockaddrInet4{Addr: addr.Addr().As4(), Port: int(addr.Port())}
	} else {
		domain = syscall.AF_INET6
		sa = &syscall.SockaddrInet6{Addr: addr.Addr().Unmap().As16(), Port: int(addr.Port())}
	}
	sd, err := syscall.Socket(domain, typ|syscall.SOCK_CLOEXEC, int(proto))
	if err != nil {
		return nil, fmt.Errorf("failed to create socket for %s/%s: %w", addr, proto, err)
	}
	// Don't leak the descriptor on any error path below.
	defer func() {
		if retErr != nil {
			syscall.Close(sd)
		}
	}()
	if err := syscall.SetsockoptInt(sd, syscall.SOL_SOCKET, syscall.SO_REUSEADDR, 1); err != nil {
		return nil, fmt.Errorf("failed to setsockopt(SO_REUSEADDR) for %s/%s: %w", addr, proto, err)
	}
	if domain == syscall.AF_INET6 {
		// Best-effort: restrict an IPv6 socket to IPv6 only, so binding ::
		// doesn't also claim the IPv4 any-address. The error is deliberately
		// ignored.
		syscall.SetsockoptInt(sd, syscall.IPPROTO_IPV6, syscall.IPV6_V6ONLY, 1)
	}
	if typ == syscall.SOCK_DGRAM {
		// Enable IP_PKTINFO for UDP sockets to get the destination address.
		// The destination address will be used as the source address when
		// sending back replies coming from the container.
		lvl := syscall.IPPROTO_IP
		opt := syscall.IP_PKTINFO
		optName := "IP_PKTINFO"
		if domain == syscall.AF_INET6 {
			lvl = syscall.IPPROTO_IPV6
			opt = syscall.IPV6_RECVPKTINFO
			optName = "IPV6_RECVPKTINFO"
		}
		if err := syscall.SetsockoptInt(sd, lvl, opt, 1); err != nil {
			return nil, fmt.Errorf("failed to setsockopt(%s) for %s/%s: %w", optName, addr, proto, err)
		}
	}
	if err := syscall.Bind(sd, sa); err != nil {
		return nil, fmt.Errorf("failed to bind host port %s/%s: %w", addr, proto, err)
	}
	// Wrap the descriptor; os.NewFile only returns nil for an invalid fd.
	boundSocket := os.NewFile(uintptr(sd), "listener")
	if boundSocket == nil {
		return nil, fmt.Errorf("failed to convert socket to file for %s/%s", addr, proto)
	}
	return boundSocket, nil
}
// bindSCTP is based on sctp.ListenSCTP. The socket is created and bound, but
// does not start listening. The caller owns the returned *os.File.
func bindSCTP(addr netip.AddrPort) (_ *os.File, retErr error) {
	// Pick the address family from the (unmapped) address.
	domain := syscall.AF_INET
	if addr.Addr().Unmap().Is6() {
		domain = syscall.AF_INET6
	}
	sd, err := syscall.Socket(domain, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, syscall.IPPROTO_SCTP)
	if err != nil {
		return nil, fmt.Errorf("failed to create socket for %s/sctp: %w", addr, err)
	}
	// Don't leak the descriptor on any error path below.
	defer func() {
		if retErr != nil {
			syscall.Close(sd)
		}
	}()
	if domain == syscall.AF_INET6 {
		// Best-effort: restrict an IPv6 socket to IPv6 only; the error is
		// deliberately ignored (matches bindTCPOrUDP).
		syscall.SetsockoptInt(sd, syscall.IPPROTO_IPV6, syscall.IPV6_V6ONLY, 1)
	}
	// Configure the SCTP INIT parameters before binding.
	if errno := setSCTPInitMsg(sd, sctp.InitMsg{NumOstreams: sctp.SCTP_MAX_STREAM}); errno != 0 {
		return nil, errno
	}
	if err := sctp.SCTPBind(sd,
		&sctp.SCTPAddr{IPAddrs: []net.IPAddr{{IP: addr.Addr().Unmap().AsSlice()}}, Port: int(addr.Port())},
		sctp.SCTP_BINDX_ADD_ADDR); err != nil {
		return nil, fmt.Errorf("failed to bind host port %s/sctp: %w", addr, err)
	}
	// Wrap the descriptor; os.NewFile only returns nil for an invalid fd.
	boundSocket := os.NewFile(uintptr(sd), "listener")
	if boundSocket == nil {
		return nil, fmt.Errorf("failed to convert socket %s/sctp", addr)
	}
	return boundSocket, nil
}
package portallocator
import (
"context"
"errors"
"fmt"
"net"
"net/netip"
"sync"
"github.com/containerd/log"
)
// ipMapping tracks, per IP address, the ports allocated for each protocol.
type ipMapping map[netip.Addr]protoMap

var (
	// errAllPortsAllocated is returned when no more ports are available
	errAllPortsAllocated = errors.New("all ports are allocated")
	// errUnknownProtocol is returned when an unknown protocol was specified
	errUnknownProtocol = errors.New("unknown protocol")
	// once guards the lazy creation of the PortAllocator singleton held in
	// instance (see Get).
	once     sync.Once
	instance *PortAllocator
)
// alreadyAllocatedErr reports that a requested port is already reserved for
// the given IP address.
type alreadyAllocatedErr struct {
	ip   string
	port int
}

// Error implements the error interface.
func (e alreadyAllocatedErr) Error() string {
	msg := fmt.Sprintf("Bind for %s:%d failed: port is already allocated", e.ip, e.port)
	return msg
}
type (
	// PortAllocator manages the transport ports database
	PortAllocator struct {
		mutex sync.Mutex // protects ipMap
		// defaultIP is the address assumed when a caller passes a nil IP.
		defaultIP net.IP
		ipMap     ipMapping
		// begin and end delimit the default (ephemeral) port range.
		begin int
		end   int
	}
	// portRange describes a contiguous range of ports and remembers the
	// last port allocated from it, so scans resume where they left off.
	portRange struct {
		begin int
		end   int
		last  int
	}
	// portMap records allocations for one (address, protocol) pair.
	portMap struct {
		// p is the set of currently allocated ports.
		p map[int]struct{}
		// defaultRange is the portRanges key of the default ephemeral range.
		defaultRange string
		portRanges   map[string]*portRange
	}
	// protoMap indexes portMaps by protocol name ("tcp", "udp" or "sctp").
	protoMap map[string]*portMap
)
// GetPortRange returns the PortAllocator's default port range.
//
// This function is for internal use in tests, and must not be used
// for other purposes.
func GetPortRange() (start, end uint16) {
	alloc := Get()
	start, end = uint16(alloc.begin), uint16(alloc.end)
	return start, end
}
// Get returns the process-wide PortAllocator singleton, creating it on
// first use.
func Get() *PortAllocator {
	once.Do(func() { instance = newInstance() })
	return instance
}
// newInstance builds a PortAllocator covering the host's dynamic port range.
func newInstance() *PortAllocator {
	begin, end := dynamicPortRange()
	pa := &PortAllocator{
		defaultIP: net.IPv4zero,
		begin:     begin,
		end:       end,
	}
	pa.ipMap = makeIpMapping(begin, end)
	return pa
}
// dynamicPortRange returns the host's ephemeral port range, falling back to
// the compiled-in defaults when it can't be determined.
func dynamicPortRange() (start, end int) {
	s, e, err := getDynamicPortRange()
	if err == nil {
		return s, e
	}
	log.G(context.TODO()).WithError(err).Infof("falling back to default port range %d-%d", defaultPortRangeStart, defaultPortRangeEnd)
	return defaultPortRangeStart, defaultPortRangeEnd
}
// makeIpMapping creates the initial address map, seeded with entries for the
// IPv4 and IPv6 unspecified addresses.
func makeIpMapping(begin, end int) ipMapping {
	m := make(ipMapping, 2)
	for _, addr := range []netip.Addr{netip.IPv4Unspecified(), netip.IPv6Unspecified()} {
		m[addr] = makeProtoMap(begin, end)
	}
	return m
}
// makeProtoMap creates a fresh per-protocol port map for one address.
func makeProtoMap(begin, end int) protoMap {
	m := make(protoMap, 3)
	for _, proto := range []string{"tcp", "udp", "sctp"} {
		m[proto] = newPortMap(begin, end)
	}
	return m
}
// RequestPort requests new port from global ports pool for specified ip and proto.
// If port is 0 it returns first free port. Otherwise it checks port availability
// in proto's pool and returns that port or error if port is already busy.
func (p *PortAllocator) RequestPort(ip net.IP, proto string, port int) (int, error) {
	addr := ip
	if addr == nil {
		addr = p.defaultIP // FIXME(thaJeztah): consider making this a required argument and producing an error instead, or set default when constructing.
	}
	return p.RequestPortsInRange([]net.IP{addr}, proto, port, port)
}
// RequestPortInRange is equivalent to [PortAllocator.RequestPortsInRange] with
// a single IP address. If ip is nil, a port is instead requested for the
// default IP (0.0.0.0).
func (p *PortAllocator) RequestPortInRange(ip net.IP, proto string, portStart, portEnd int) (int, error) {
	addr := ip
	if addr == nil {
		addr = p.defaultIP // FIXME(thaJeztah): consider making this a required argument and producing an error instead, or set default when constructing.
	}
	return p.RequestPortsInRange([]net.IP{addr}, proto, portStart, portEnd)
}
// RequestPortsInRange requests new ports from the global ports pool, for proto and each of ips.
// If portStart and portEnd are 0 it returns the first free port in the default ephemeral range.
// If portStart != portEnd it returns the first free port in the requested range.
// Otherwise, (portStart == portEnd) it checks port availability in the requested proto's port-pool
// and returns that port or error if port is already busy.
func (p *PortAllocator) RequestPortsInRange(ips []net.IP, proto string, portStart, portEnd int) (int, error) {
	if proto != "tcp" && proto != "udp" && proto != "sctp" {
		return 0, errUnknownProtocol
	}
	if portStart != 0 || portEnd != 0 {
		// Validate custom port-range
		if portStart == 0 || portEnd == 0 || portEnd < portStart {
			return 0, fmt.Errorf("invalid port range: %d-%d", portStart, portEnd)
		}
	}
	if len(ips) == 0 {
		return 0, errors.New("no IP addresses specified")
	}
	p.mutex.Lock()
	defer p.mutex.Unlock()
	// Collect the portMap for the required proto and each of the IP addresses.
	// If there's a new IP address, create portMap objects for each of the protocols
	// and collect the one that's needed for this request.
	// Mark these portMap objects as needing port allocations.
	type portMapRef struct {
		portMap  *portMap
		allocate bool
	}
	ipToPortMapRef := map[netip.Addr]*portMapRef{}
	var ips4, ips6 bool
	for _, ip := range ips {
		addr, ok := netip.AddrFromSlice(ip)
		if !ok {
			return 0, fmt.Errorf("invalid IP address: %s", ip)
		}
		// Track IPv4-mapped IPv6 addresses as plain IPv4.
		addr = addr.Unmap()
		if addr.Is4() {
			ips4 = true
		} else {
			ips6 = true
		}
		// Make sure addr -> protoMap[proto] -> portMap exists.
		if _, ok := p.ipMap[addr]; !ok {
			p.ipMap[addr] = makeProtoMap(p.begin, p.end)
		}
		// Remember the protoMap[proto] portMap, it needs the port allocation.
		ipToPortMapRef[addr] = &portMapRef{
			portMap:  p.ipMap[addr][proto],
			allocate: true,
		}
	}
	// If ips includes an unspecified address, the port needs to be free in all ipMaps
	// for that address family. Otherwise, the port only needs to be free in the
	// per-address maps for ips, and the map for 0.0.0.0/::.
	//
	// Collect the additional portMaps where the port needs to be free, but
	// don't mark them as needing port allocation.
	for _, unspecAddr := range []netip.Addr{netip.IPv4Unspecified(), netip.IPv6Unspecified()} {
		if _, ok := ipToPortMapRef[unspecAddr]; ok {
			for addr, ipm := range p.ipMap {
				if unspecAddr.Is4() == addr.Is4() {
					if _, ok := ipToPortMapRef[addr]; !ok {
						ipToPortMapRef[addr] = &portMapRef{portMap: ipm[proto]}
					}
				}
			}
		} else if (unspecAddr.Is4() && ips4) || (unspecAddr.Is6() && ips6) {
			ipToPortMapRef[unspecAddr] = &portMapRef{portMap: p.ipMap[unspecAddr][proto]}
		}
	}
	// Handle a request for a specific port.
	if portStart > 0 && portStart == portEnd {
		// First verify the port is free everywhere it needs to be, then
		// mark it allocated only in the maps that requested allocation.
		for addr, pMap := range ipToPortMapRef {
			if _, allocated := pMap.portMap.p[portStart]; allocated {
				return 0, alreadyAllocatedErr{ip: addr.String(), port: portStart}
			}
		}
		for _, pMap := range ipToPortMapRef {
			if pMap.allocate {
				pMap.portMap.p[portStart] = struct{}{}
			}
		}
		return portStart, nil
	}
	// Handle a request for a port range.
	// Create/fetch ranges for each portMap.
	pRanges := map[netip.Addr]*portRange{}
	for addr, pMap := range ipToPortMapRef {
		pRanges[addr] = pMap.portMap.getPortRange(portStart, portEnd)
	}
	// Arbitrarily starting after the last port allocated for the first address, search
	// for a port that's available in all ranges.
	firstAddr, _ := netip.AddrFromSlice(ips[0])
	firstRange := pRanges[firstAddr.Unmap()]
	port := firstRange.last
	for i := firstRange.begin; i <= firstRange.end; i++ {
		port++
		if port > firstRange.end {
			// Wrap around to the start of the range.
			port = firstRange.begin
		}
		portAlreadyAllocated := func() bool {
			for _, pMap := range ipToPortMapRef {
				if _, ok := pMap.portMap.p[port]; ok {
					return true
				}
			}
			return false
		}
		if !portAlreadyAllocated() {
			// Allocate the port in every map that asked for it, and record
			// it as the resume point for the next scan.
			for addr, pMap := range ipToPortMapRef {
				if pMap.allocate {
					pMap.portMap.p[port] = struct{}{}
					pRanges[addr].last = port
				}
			}
			return port, nil
		}
	}
	return 0, errAllPortsAllocated
}
// ReleasePort releases port from global ports pool for specified ip and proto.
func (p *PortAllocator) ReleasePort(ip net.IP, proto string, port int) {
	p.mutex.Lock()
	defer p.mutex.Unlock()

	if ip == nil {
		ip = p.defaultIP // FIXME(thaJeztah): consider making this a required argument and producing an error instead, or set default when constructing.
	}
	addr, ok := netip.AddrFromSlice(ip)
	if !ok {
		// Not a valid address, so nothing can have been allocated for it.
		return
	}
	if pm, found := p.ipMap[addr.Unmap()]; found {
		delete(pm[proto].p, port)
	}
}
// ReleaseAll releases all ports for all ips.
func (p *PortAllocator) ReleaseAll() {
	// Re-read the host's port range before resetting the state.
	begin, end := dynamicPortRange()
	p.mutex.Lock()
	defer p.mutex.Unlock()
	p.ipMap = makeIpMapping(begin, end)
}
// getRangeKey builds the map key under which a port range is stored,
// e.g. "49153-65535".
func getRangeKey(portStart, portEnd int) string {
	key := fmt.Sprintf("%d-%d", portStart, portEnd)
	return key
}
// newPortRange creates allocation state for the range [portStart, portEnd].
// last starts at portEnd so the first scan begins at portStart.
func newPortRange(portStart, portEnd int) *portRange {
	pr := &portRange{begin: portStart, end: portEnd}
	pr.last = portEnd
	return pr
}
// newPortMap creates an empty portMap whose default range is
// [portStart, portEnd].
func newPortMap(portStart, portEnd int) *portMap {
	key := getRangeKey(portStart, portEnd)
	pm := &portMap{
		p:            map[int]struct{}{},
		defaultRange: key,
		portRanges:   map[string]*portRange{},
	}
	pm.portRanges[key] = newPortRange(portStart, portEnd)
	return pm
}
// getPortRange returns the tracked range for [portStart, portEnd], creating
// it on first use. A (0, 0) request maps to the default ephemeral range.
func (pm *portMap) getPortRange(portStart, portEnd int) *portRange {
	key := pm.defaultRange
	if portStart != 0 || portEnd != 0 {
		key = getRangeKey(portStart, portEnd)
	}
	// Return existing port range, if already known.
	if pr, ok := pm.portRanges[key]; ok {
		return pr
	}
	// Otherwise create a new port range.
	pr := newPortRange(portStart, portEnd)
	pm.portRanges[key] = pr
	return pr
}
package portallocator
import (
"bufio"
"fmt"
"os"
)
// getDynamicPortRange reads the host's ephemeral port range from the
// net.ipv4.ip_local_port_range sysctl (which, despite the name, also applies
// to IPv6). It returns the inclusive start and end of the range, or an error
// if the file can't be opened or parsed.
func getDynamicPortRange() (start int, end int, _ error) {
	const portRangeKernelParam = "/proc/sys/net/ipv4/ip_local_port_range"
	file, err := os.Open(portRangeKernelParam)
	if err != nil {
		return 0, 0, err
	}
	defer file.Close()

	// The file contains two tab-separated integers, e.g. "32768\t60999".
	n, err := fmt.Fscanf(bufio.NewReader(file), "%d\t%d", &start, &end)
	if n != 2 || err != nil {
		if err == nil {
			err = fmt.Errorf("unexpected count of parsed numbers (%d)", n)
		}
		// Wrap with %w (was %v) so callers can inspect the underlying error
		// with errors.Is/As.
		return 0, 0, fmt.Errorf("port allocator - failed to parse system ephemeral port range from %s: %w", portRangeKernelParam, err)
	}
	return start, end, nil
}
//go:build linux && !386
package portallocator
import (
"syscall"
"unsafe"
"github.com/ishidawataru/sctp"
)
// setSCTPInitMsg sets the SCTP_INITMSG socket option on socket descriptor sd,
// controlling the INIT parameters (such as the number of outbound streams)
// used for associations on that socket. It returns the raw errno from
// setsockopt(2); 0 means success.
func setSCTPInitMsg(sd int, options sctp.InitMsg) syscall.Errno {
	// A raw syscall is used because the options struct must be passed by
	// pointer/size, which the typed syscall wrappers don't cover for SCTP.
	_, _, errno := syscall.Syscall6(syscall.SYS_SETSOCKOPT,
		uintptr(sd),
		sctp.SOL_SCTP,
		sctp.SCTP_INITMSG,
		uintptr(unsafe.Pointer(&options)), // #nosec G103 -- Ignore "G103: Use of unsafe calls should be audited"
		unsafe.Sizeof(options),
		0)
	return errno
}
package portmapper
import (
"context"
"errors"
"fmt"
"io"
"os"
"os/exec"
"runtime"
"strconv"
"sync/atomic"
"syscall"
"time"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/types"
)
// StartProxy starts the proxy process at proxyPath.
// If listenSock is not nil, it must be a bound socket that can be passed to
// the proxy process for it to listen on.
//
// The returned stop function signals the proxy with SIGINT and waits for it
// to exit.
func StartProxy(pb types.PortBinding,
	proxyPath string,
	listenSock *os.File,
) (stop func() error, retErr error) {
	if proxyPath == "" {
		return nil, errors.New("no path provided for userland-proxy binary")
	}
	// The proxy reports its startup status (or an error message) back over
	// this pipe; the write end is inherited by the child via ExtraFiles.
	r, w, err := os.Pipe()
	if err != nil {
		return nil, fmt.Errorf("proxy unable to open os.Pipe %s", err)
	}
	defer func() {
		// w is set to nil below once the child owns the write end.
		if w != nil {
			w.Close()
		}
		r.Close()
	}()
	cmd := &exec.Cmd{
		Path: proxyPath,
		Args: []string{
			proxyPath,
			"-proto", pb.Proto.String(),
			"-host-ip", pb.HostIP.String(),
			"-host-port", strconv.FormatUint(uint64(pb.HostPort), 10),
			"-container-ip", pb.IP.String(),
			"-container-port", strconv.FormatUint(uint64(pb.Port), 10),
		},
		ExtraFiles: []*os.File{w},
		SysProcAttr: &syscall.SysProcAttr{
			Pdeathsig: syscall.SIGTERM, // send a sigterm to the proxy if the creating thread in the daemon process dies (https://go.dev/issue/27505)
		},
	}
	if listenSock != nil {
		cmd.Args = append(cmd.Args, "-use-listen-fd")
		cmd.ExtraFiles = append(cmd.ExtraFiles, listenSock)
	}
	wait := make(chan error, 1)
	// As p.cmd.SysProcAttr.Pdeathsig is set, the signal will be sent to the
	// process when the OS thread on which p.cmd.Start() was executed dies.
	// If the thread is allowed to be released back into the goroutine
	// thread pool, the thread could get terminated at any time if a
	// goroutine gets scheduled onto it which calls runtime.LockOSThread()
	// and exits without a matching number of runtime.UnlockOSThread()
	// calls. Ensure that the thread from which Start() is called stays
	// alive until the proxy or the daemon process exits to prevent the
	// proxy from getting terminated early. See https://go.dev/issue/27505
	// for more details.
	started := make(chan error)
	var stopped atomic.Bool
	go func() {
		runtime.LockOSThread()
		defer runtime.UnlockOSThread()
		err := cmd.Start()
		started <- err
		if err != nil {
			return
		}
		err = cmd.Wait()
		if !stopped.Load() {
			// The proxy exited without stop() being called; log it
			// (expected during daemon shutdown).
			log.G(context.Background()).WithFields(log.Fields{
				"proto":          pb.Proto,
				"host-ip":        pb.HostIP,
				"host-port":      pb.HostPort,
				"container-ip":   pb.IP,
				"container-port": pb.Port,
			}).Info("Userland proxy exited early (this is expected during daemon shutdown)")
		}
		wait <- err
	}()
	if err := <-started; err != nil {
		return nil, err
	}
	// Close the parent's copy of the write end so reads below see EOF when
	// the child exits; nil it so the deferred cleanup skips it.
	w.Close()
	w = nil
	errchan := make(chan error, 1)
	go func() {
		// On success the proxy writes "0\n"; otherwise the pipe carries an
		// error message. The Read error is deliberately not checked — a
		// short/failed read leaves buf != "0\n" and is handled below.
		buf := make([]byte, 2)
		r.Read(buf)
		if string(buf) != "0\n" {
			errStr, err := io.ReadAll(r)
			if err != nil {
				errchan <- fmt.Errorf("error reading exit status from userland proxy: %v", err)
				return
			}
			// If the user has an old docker-proxy in their PATH, and we passed "-use-listen-fd"
			// on the command line, it exits with no response on the pipe.
			if listenSock != nil && buf[0] == 0 && len(errStr) == 0 {
				errchan <- errors.New("failed to start docker-proxy, check that the current version is in your $PATH")
				return
			}
			errchan <- fmt.Errorf("error starting userland proxy: %s", errStr)
			return
		}
		errchan <- nil
	}()
	select {
	case err := <-errchan:
		if err != nil {
			return nil, err
		}
	case <-time.After(16 * time.Second):
		return nil, errors.New("timed out starting the userland proxy")
	}
	stopFn := func() error {
		if cmd.Process == nil {
			return nil
		}
		// Mark the stop as intentional so the waiter doesn't log an
		// early-exit message, then interrupt the proxy and wait for it.
		stopped.Store(true)
		if err := cmd.Process.Signal(os.Interrupt); err != nil {
			return err
		}
		return <-wait
	}
	return stopFn, nil
}
package portmapperapi
import (
"context"
"net"
"net/netip"
"os"
"strings"
"github.com/docker/docker/daemon/libnetwork/types"
)
// Registerer provides a callback interface for registering port-mappers.
type Registerer interface {
	// Register provides a way for port-mappers to dynamically register with
	// libnetwork under the given name.
	Register(name string, driver PortMapper) error
}
// PortMapper maps / unmaps container ports to host ports.
type PortMapper interface {
	// MapPorts takes a list of port binding requests, and returns a list of
	// PortBinding. Both lists MUST have the same size.
	//
	// Multiple port bindings are passed when they're all requesting the
	// same port range, or an ephemeral port, over multiple IP addresses and
	// all pointing to the same container port. In that case, the PortMapper
	// MUST assign the same HostPort for all IP addresses.
	//
	// When an ephemeral port, or a single port from a range is requested
	// MapPorts should attempt a few times to find a free port available
	// across all IP addresses.
	MapPorts(ctx context.Context, reqs []PortBindingReq, fwn Firewaller) ([]PortBinding, error)
	// UnmapPorts takes a list of port bindings to unmap, releasing the
	// resources MapPorts acquired for them.
	UnmapPorts(ctx context.Context, pbs []PortBinding, fwn Firewaller) error
}
// PortBindingReq is a request for a single port mapping, handed to a
// PortMapper's MapPorts.
type PortBindingReq struct {
	types.PortBinding
	// Mapper is the name of the port mapper used to process this PortBindingReq.
	Mapper string
	// ChildHostIP is a temporary field used to pass the host IP address as
	// seen from the daemon. (It'll be removed once the portmapper API is
	// implemented).
	ChildHostIP net.IP `json:"-"`
}
// Compare defines an ordering over PortBindingReq such that bindings that
// differ only in host IP are adjacent (those bindings should be allocated the
// same port).
//
// Port bindings are first sorted by their mapper, then:
//   - exact host ports are placed before ranges (in case exact ports fall
//     within ranges, giving a better chance of allocating the exact ports),
//   - same container port are adjacent (lowest ports first), then
//   - same protocols are adjacent (tcp < udp < sctp), then
//   - same host ports or ranges are adjacent, then
//   - ordered by container IP (then host IP, if set).
func (pbReq PortBindingReq) Compare(other PortBindingReq) int {
	if c := strings.Compare(pbReq.Mapper, other.Mapper); c != 0 {
		return c
	}
	// Exact host port < host port range.
	isRange := func(pb PortBindingReq) bool {
		return pb.HostPort == 0 || pb.HostPort != pb.HostPortEnd
	}
	switch aRange, bRange := isRange(pbReq), isRange(other); {
	case aRange && !bRange:
		return 1
	case bRange && !aRange:
		return -1
	}
	// Numeric fields, in priority order.
	for _, d := range [...]int{
		int(pbReq.Port) - int(other.Port),
		int(pbReq.Proto) - int(other.Proto),
		int(pbReq.HostPort) - int(other.HostPort),
		int(pbReq.HostPortEnd) - int(other.HostPortEnd),
	} {
		if d != 0 {
			return d
		}
	}
	// Finally, order by host IP, then container IP.
	aHostIP, _ := netip.AddrFromSlice(pbReq.HostIP)
	bHostIP, _ := netip.AddrFromSlice(other.HostIP)
	if c := aHostIP.Unmap().Compare(bHostIP.Unmap()); c != 0 {
		return c
	}
	aIP, _ := netip.AddrFromSlice(pbReq.IP)
	bIP, _ := netip.AddrFromSlice(other.IP)
	return aIP.Unmap().Compare(bIP.Unmap())
}
// PortBinding is a realized port mapping, returned by a PortMapper's
// MapPorts, carrying the resources that must be released on unmap.
type PortBinding struct {
	types.PortBinding
	// Mapper is the name of the port mapper used to process this PortBinding.
	Mapper string
	// BoundSocket is used to reserve a host port for the binding. If the
	// userland proxy is in-use, it's passed to the proxy when the proxy is
	// started, then it's closed and set to nil here.
	BoundSocket *os.File `json:"-"`
	// ChildHostIP is the host IP address, as seen from the daemon. This
	// is normally the same as PortBinding.HostIP but, in rootless mode, it
	// will be an address in the rootless network namespace. RootlessKit
	// binds the port on the real (parent) host address and maps it to the
	// same port number on the address dockerd sees in the child namespace.
	// So, for example, docker-proxy and DNAT rules need to use the child
	// namespace's host address. (PortBinding.HostIP isn't replaced by the
	// child address, because it's stored as user-config and the child
	// address may change if RootlessKit is configured differently.)
	ChildHostIP net.IP `json:"-"`
	// PortDriverRemove is a function that will inform the RootlessKit
	// port driver about removal of a port binding, or nil.
	PortDriverRemove func() error `json:"-"`
	// StopProxy is a function to stop the userland proxy for this binding,
	// if a proxy has been started - else nil.
	StopProxy func() error `json:"-"`
	// RootlesskitUnsupported is set to true when the port binding is not
	// supported by the port driver of RootlessKit.
	RootlesskitUnsupported bool `json:"-"`
}
// ChildPortBinding is pb.PortBinding, with the host address the daemon
// will see - which, in rootless mode, will be an address in the RootlessKit's
// child namespace (see PortBinding.ChildHostIP).
func (pb PortBinding) ChildPortBinding() types.PortBinding {
	out := pb.PortBinding
	out.HostIP = pb.ChildHostIP
	return out
}
package nat
import (
"context"
"errors"
"fmt"
"net"
"net/netip"
"os"
"strconv"
"syscall"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/internal/rlkclient"
"github.com/docker/docker/daemon/libnetwork/portallocator"
"github.com/docker/docker/daemon/libnetwork/portmapperapi"
"github.com/docker/docker/daemon/libnetwork/types"
)
const (
	// driverName is the name under which this port mapper registers itself.
	driverName = "nat"
	// maxAllocatePortAttempts is the number of times MapPorts retries an
	// ephemeral/ranged allocation before giving up.
	maxAllocatePortAttempts = 10
)

// PortDriverClient is the subset of the RootlessKit port-driver client used
// by this package.
type PortDriverClient interface {
	// ChildHostIP maps a host address to the corresponding address in the
	// RootlessKit child namespace.
	ChildHostIP(hostIP netip.Addr) netip.Addr
	// AddPort asks the port driver to map hostPort; it returns a function
	// that removes the mapping.
	AddPort(ctx context.Context, proto string, hostIP, childIP netip.Addr, hostPort int) (func() error, error)
}

// proxyStarter launches a userland proxy for a port binding, optionally
// handing it a pre-bound socket, and returns a function that stops it.
type proxyStarter func(types.PortBinding, *os.File) (func() error, error)
// Register the "nat" port-mapper with libnetwork.
func Register(r portmapperapi.Registerer, cfg Config) error {
	pm := NewPortMapper(cfg)
	return r.Register(driverName, pm)
}
// PortMapper is the "nat" port-mapper implementation.
type PortMapper struct {
	// pdc is used to interact with rootlesskit port driver.
	pdc PortDriverClient
	// startProxy launches the userland proxy for a binding.
	startProxy proxyStarter
	// enableProxy indicates whether a userland proxy should be started for
	// each mapped port.
	enableProxy bool
}

// Config carries the dependencies needed to construct a PortMapper.
type Config struct {
	// RlkClient is called by MapPorts to determine the ChildHostIP and ask
	// rootlesskit to map ports in its netns.
	RlkClient PortDriverClient
	// StartProxy launches the userland proxy for a binding.
	StartProxy proxyStarter
	// EnableProxy indicates whether the userland proxy is enabled.
	EnableProxy bool
}
// NewPortMapper builds a PortMapper from cfg.
func NewPortMapper(cfg Config) PortMapper {
	pm := PortMapper{enableProxy: cfg.EnableProxy}
	pm.pdc = cfg.RlkClient
	pm.startProxy = cfg.StartProxy
	return pm
}
// MapPorts allocates and binds host ports for the given cfg. The caller is
// responsible for ensuring that all entries in cfg map the same proto,
// container port, and host port range (their host addresses must differ).
func (pm PortMapper) MapPorts(ctx context.Context, cfg []portmapperapi.PortBindingReq, fwn portmapperapi.Firewaller) ([]portmapperapi.PortBinding, error) {
	if len(cfg) == 0 {
		return nil, nil
	}
	// Ensure that all of cfg's entries have the same proto and ports.
	proto, port, hostPort, hostPortEnd := cfg[0].Proto, cfg[0].Port, cfg[0].HostPort, cfg[0].HostPortEnd
	for _, c := range cfg[1:] {
		if c.Proto != proto || c.Port != port || c.HostPort != hostPort || c.HostPortEnd != hostPortEnd {
			return nil, types.InternalErrorf("port binding mismatch %d/%s:%d-%d, %d/%s:%d-%d",
				port, proto, hostPort, hostPortEnd,
				port, c.Proto, c.HostPort, c.HostPortEnd)
		}
	}
	// Try up to maxAllocatePortAttempts times to get a port that's not already allocated.
	var bindings []portmapperapi.PortBinding
	var err error
	for i := 0; i < maxAllocatePortAttempts; i++ {
		bindings, err = pm.attemptBindHostPorts(ctx, cfg, proto, hostPort, hostPortEnd, fwn)
		if err == nil {
			break
		}
		// There is no point in immediately retrying to map an explicitly chosen port.
		if hostPort != 0 && hostPort == hostPortEnd {
			log.G(ctx).WithError(err).Warnf("Failed to allocate and map port")
			return nil, err
		}
		log.G(ctx).WithFields(log.Fields{
			"error":   err,
			"attempt": i + 1,
		}).Warn("Failed to allocate and map port")
	}
	if err != nil {
		// If the retry budget is exhausted and no free port could be found, return
		// the latest error.
		return nil, err
	}
	// Start userland proxy processes.
	if pm.enableProxy {
		for i := range bindings {
			// Skip bindings with no reserved socket, ones rootlesskit can't
			// support, and ones that already have a running proxy.
			if bindings[i].BoundSocket == nil || bindings[i].RootlesskitUnsupported || bindings[i].StopProxy != nil {
				continue
			}
			var err error
			bindings[i].StopProxy, err = pm.startProxy(
				bindings[i].ChildPortBinding(), bindings[i].BoundSocket,
			)
			if err != nil {
				// NOTE(review): proxies already started for earlier bindings
				// are not stopped on this path — presumably the caller
				// unmaps the returned/held bindings on error; verify.
				return nil, fmt.Errorf("failed to start userland proxy for port mapping %s: %w",
					bindings[i].PortBinding, err)
			}
			// The proxy now owns the listening socket; drop our copy.
			if err := bindings[i].BoundSocket.Close(); err != nil {
				log.G(ctx).WithFields(log.Fields{
					"error":   err,
					"mapping": bindings[i].PortBinding,
				}).Warnf("failed to close proxy socket")
			}
			bindings[i].BoundSocket = nil
		}
	}
	return bindings, nil
}
// UnmapPorts releases everything held for pbs: bound sockets, RootlessKit
// port-driver registrations, userland proxies, firewall rules, and finally
// the logical port reservations. All errors are collected and returned
// joined, so one failure doesn't prevent the remaining cleanup.
func (pm PortMapper) UnmapPorts(ctx context.Context, pbs []portmapperapi.PortBinding, fwn portmapperapi.Firewaller) error {
	var errs []error
	for _, pb := range pbs {
		if sock := pb.BoundSocket; sock != nil {
			if err := sock.Close(); err != nil {
				errs = append(errs, fmt.Errorf("failed to close socket for port mapping %s: %w", pb, err))
			}
		}
		if remove := pb.PortDriverRemove; remove != nil {
			if err := remove(); err != nil {
				errs = append(errs, err)
			}
		}
		if stop := pb.StopProxy; stop != nil {
			if err := stop(); err != nil && !errors.Is(err, os.ErrProcessDone) {
				errs = append(errs, fmt.Errorf("failed to stop userland proxy: %w", err))
			}
		}
	}
	if err := fwn.DelPorts(ctx, mergeChildHostIPs(pbs)); err != nil {
		errs = append(errs, err)
	}
	for _, pb := range pbs {
		portallocator.Get().ReleasePort(pb.ChildHostIP, pb.Proto.String(), int(pb.HostPort))
	}
	return errors.Join(errs...)
}
// attemptBindHostPorts allocates host ports for each NAT port mapping, and
// reserves those ports by binding them.
//
// If the allocator doesn't have an available port in the required range, or the
// port can't be bound (perhaps because another process has already bound it),
// all resources are released and an error is returned. When ports are
// successfully reserved, a PortBinding is returned for each mapping.
func (pm PortMapper) attemptBindHostPorts(
	ctx context.Context,
	cfg []portmapperapi.PortBindingReq,
	proto types.Protocol,
	hostPortStart, hostPortEnd uint16,
	fwn portmapperapi.Firewaller,
) (_ []portmapperapi.PortBinding, retErr error) {
	// Resolve the host address each binding should use as seen from the
	// daemon (this differs from HostIP in rootless mode).
	addrs := make([]net.IP, 0, len(cfg))
	for i := range cfg {
		cfg[i] = setChildHostIP(pm.pdc, cfg[i])
		addrs = append(addrs, cfg[i].ChildHostIP)
	}

	pa := portallocator.NewOSAllocator()
	// Note: the redundant `var err error; var port int` predeclarations were
	// removed — `:=` below declares all three results in this scope.
	port, socks, err := pa.RequestPortsInRange(addrs, proto, int(hostPortStart), int(hostPortEnd))
	if err != nil {
		return nil, err
	}
	// Release the logical reservation on any error path below.
	defer func() {
		if retErr != nil {
			pa.ReleasePorts(addrs, proto, port)
		}
	}()

	if len(socks) != len(cfg) {
		for _, sock := range socks {
			if err := sock.Close(); err != nil {
				log.G(ctx).WithError(err).Warn("Failed to close socket")
			}
		}
		return nil, types.InternalErrorf("port allocator returned %d sockets for %d port bindings", len(socks), len(cfg))
	}

	res := make([]portmapperapi.PortBinding, 0, len(cfg))
	// Undo everything set up so far (sockets, port-driver config, firewall
	// rules) on any error path below.
	defer func() {
		if retErr != nil {
			if err := pm.UnmapPorts(ctx, res, fwn); err != nil {
				log.G(ctx).WithFields(log.Fields{
					"pbs":   res,
					"error": err,
				}).Warn("Failed to release port bindings")
			}
		}
	}()

	for i := range cfg {
		pb := portmapperapi.PortBinding{
			PortBinding: cfg[i].PortBinding.GetCopy(),
			BoundSocket: socks[i],
			ChildHostIP: cfg[i].ChildHostIP,
		}
		// Record the single port actually allocated for this binding.
		pb.PortBinding.HostPort = uint16(port)
		pb.PortBinding.HostPortEnd = pb.HostPort
		res = append(res, pb)
	}

	if err := configPortDriver(ctx, res, pm.pdc); err != nil {
		return nil, err
	}
	if err := fwn.AddPorts(ctx, mergeChildHostIPs(res)); err != nil {
		return nil, err
	}
	// Now the firewall rules are set up, it's safe to listen on the socket. (Listening
	// earlier could result in dropped connections if the proxy becomes unreachable due
	// to NAT rules sending packets directly to the container.)
	//
	// If not starting the proxy, nothing will ever accept a connection on the
	// socket. Listen here anyway because SO_REUSEADDR is set, so bind() won't notice
	// the problem if a port's bound to both INADDR_ANY and a specific address. (Also
	// so the binding shows up in "netstat -at".)
	if err := listenBoundPorts(res, pm.enableProxy); err != nil {
		return nil, err
	}
	return res, nil
}
// setChildHostIP returns req with its ChildHostIP field populated. Without a
// port-driver client the child host IP is just the host IP itself; otherwise
// the port driver is asked to translate the host address.
func setChildHostIP(pdc PortDriverClient, req portmapperapi.PortBindingReq) portmapperapi.PortBindingReq {
	if pdc == nil {
		req.ChildHostIP = req.HostIP
		return req
	}
	// NOTE(review): a parse failure yields the zero netip.Addr here — assumes
	// pdc.ChildHostIP tolerates that; confirm against the driver implementation.
	addr, _ := netip.AddrFromSlice(req.HostIP)
	req.ChildHostIP = pdc.ChildHostIP(addr).AsSlice()
	return req
}
// mergeChildHostIPs take a slice of PortBinding and returns a slice of
// types.PortBinding, where the HostIP in each of the results has the
// value of ChildHostIP from the input (if present).
func mergeChildHostIPs(pbs []portmapperapi.PortBinding) []types.PortBinding {
	out := make([]types.PortBinding, 0, len(pbs))
	for _, binding := range pbs {
		merged := binding.PortBinding
		if binding.ChildHostIP != nil {
			merged.HostIP = binding.ChildHostIP
		}
		out = append(out, merged)
	}
	return out
}
// configPortDriver passes the port binding's details to rootlesskit, and updates the
// port binding with callbacks to remove the rootlesskit config (or marks the binding as
// unsupported by rootlesskit).
func configPortDriver(ctx context.Context, pbs []portmapperapi.PortBinding, pdc PortDriverClient) error {
	// Without a port-driver client there is nothing to configure.
	if pdc == nil {
		return nil
	}
	for i := range pbs {
		b := pbs[i]
		if b.HostPort == 0 {
			// No host port mapped; nothing for the driver to do.
			continue
		}
		hip, ok := netip.AddrFromSlice(b.HostIP)
		if !ok {
			return fmt.Errorf("invalid host IP address in %s", b)
		}
		chip, ok := netip.AddrFromSlice(b.ChildHostIP)
		if !ok {
			return fmt.Errorf("invalid child host IP address %s in %s", b.ChildHostIP, b)
		}
		remove, err := pdc.AddPort(ctx, b.Proto.String(), hip, chip, int(b.HostPort))
		pbs[i].PortDriverRemove = remove
		if err != nil {
			var pErr *rlkclient.ProtocolUnsupportedError
			if errors.As(err, &pErr) {
				// The driver cannot handle this protocol: log it, mark the binding
				// so the proxy is not started for it, and keep going.
				log.G(ctx).WithFields(log.Fields{
					"error": pErr,
				}).Warnf("discarding request for %q", net.JoinHostPort(hip.String(), strconv.Itoa(int(b.HostPort))))
				pbs[i].RootlesskitUnsupported = true
				continue
			}
			return err
		}
	}
	return nil
}
// listenBoundPorts calls Listen on each bound TCP/SCTP socket in pbs so that
// connections can be accepted. UDP sockets, bindings without a socket, and
// bindings marked unsupported by rootlesskit are skipped.
//
// When the userland proxy is disabled, TCP sockets are listened with backlog 0
// so no connection is ever accepted; SCTP does not support a zero backlog, so
// -1 is used (silently capped by the kernel to net.core.somaxconn).
func listenBoundPorts(pbs []portmapperapi.PortBinding, proxyEnabled bool) error {
	for i := range pbs {
		if pbs[i].BoundSocket == nil || pbs[i].RootlesskitUnsupported || pbs[i].Proto == types.UDP {
			continue
		}
		rc, err := pbs[i].BoundSocket.SyscallConn()
		if err != nil {
			// Fix: use %s for Proto (it was %d), consistent with the other messages.
			return fmt.Errorf("raw conn not available on %s socket: %w", pbs[i].Proto, err)
		}
		if errC := rc.Control(func(fd uintptr) {
			somaxconn := 0
			// SCTP sockets do not support somaxconn=0
			if proxyEnabled || pbs[i].Proto == types.SCTP {
				somaxconn = -1 // silently capped to "/proc/sys/net/core/somaxconn"
			}
			err = syscall.Listen(int(fd), somaxconn)
		}); errC != nil {
			// Fix: wrap errC (the Control error) rather than err, which is only
			// set inside the callback and may be nil here.
			return fmt.Errorf("failed to Control %s socket: %w", pbs[i].Proto, errC)
		}
		if err != nil {
			return fmt.Errorf("failed to listen on %s socket: %w", pbs[i].Proto, err)
		}
	}
	return nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package routed
import (
"context"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/portmapperapi"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/internal/sliceutil"
)
// driverName is the name under which this port-mapper registers itself.
const driverName = "routed"
// Register the "routed" port-mapper with libnetwork.
func Register(r portmapperapi.Registerer) error {
	pm := NewPortMapper()
	return r.Register(driverName, pm)
}
// PortMapper implements the "routed" port-mapping strategy: no NAT is set up,
// and requested host ports are ignored (see MapPorts). It is stateless.
type PortMapper struct{}

// NewPortMapper returns a new "routed" PortMapper.
func NewPortMapper() PortMapper {
	return PortMapper{}
}
// MapPorts sets up firewall rules to allow direct remote access to pbs.
func (pm PortMapper) MapPorts(ctx context.Context, reqs []portmapperapi.PortBindingReq, fwn portmapperapi.Firewaller) ([]portmapperapi.PortBinding, error) {
	if len(reqs) == 0 {
		return nil, nil
	}
	pbs := make([]portmapperapi.PortBinding, 0, len(reqs))
	fwPorts := make([]types.PortBinding, 0, len(reqs))
	for _, req := range reqs {
		b := portmapperapi.PortBinding{PortBinding: req.GetCopy()}
		// Routed mode does no NAT, so a requested host port cannot be honoured.
		// Log and clear it instead of failing the whole request.
		if b.HostPort != 0 || b.HostPortEnd != 0 {
			log.G(ctx).WithFields(log.Fields{"mapping": b}).Infof(
				"Host port ignored, because NAT is disabled")
			b.HostPort = 0
			b.HostPortEnd = 0
		}
		pbs = append(pbs, b)
		fwPorts = append(fwPorts, b.PortBinding)
	}
	if err := fwn.AddPorts(ctx, fwPorts); err != nil {
		return nil, err
	}
	return pbs, nil
}
// UnmapPorts removes firewall rules allowing direct remote access to the pbs.
func (pm PortMapper) UnmapPorts(ctx context.Context, pbs []portmapperapi.PortBinding, fwn portmapperapi.Firewaller) error {
	bindings := make([]types.PortBinding, 0, len(pbs))
	for _, pb := range pbs {
		bindings = append(bindings, pb.PortBinding)
	}
	return fwn.DelPorts(ctx, bindings)
}
// Package resolvconf provides utility code to query and update DNS configuration in /etc/resolv.conf
package resolvconf
import (
"bytes"
"fmt"
"net/netip"
"os"
"github.com/docker/docker/daemon/libnetwork/internal/resolvconf"
"github.com/opencontainers/go-digest"
)
// constants for the IP address type
const (
	IP = iota // IPv4 and IPv6
	IPv4 // IPv4 addresses only
	IPv6 // IPv6 addresses only
)
// File contains the resolv.conf content and its hash
type File struct {
	Content []byte // raw resolv.conf file contents
	Hash []byte // digest of Content, as produced by digest.FromBytes
}
// Path returns the path of the resolv.conf file to use, delegating to the
// internal resolvconf package.
func Path() string {
	return resolvconf.Path()
}
// Get returns the contents of the host's resolv.conf (at the location reported
// by Path) and its hash.
func Get() (*File, error) {
	return GetSpecific(Path())
}
// GetSpecific returns the contents of the user specified resolv.conf file and its hash
func GetSpecific(path string) (*File, error) {
	content, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	return &File{
		Content: content,
		Hash:    []byte(digest.FromBytes(content)),
	}, nil
}
// FilterResolvDNS cleans up the config in resolvConf. It has two main jobs:
//  1. It looks for localhost (127.*|::1) entries in the provided
//     resolv.conf, removing local nameserver entries, and, if the resulting
//     cleaned config has no defined nameservers left, adds default DNS entries
//  2. Given the caller provides the enable/disable state of IPv6, the filter
//     code will remove all IPv6 nameservers if it is not enabled for containers
func FilterResolvDNS(resolvConf []byte, ipv6Enabled bool) (*File, error) {
	conf, err := resolvconf.Parse(bytes.NewBuffer(resolvConf), "")
	if err != nil {
		return nil, err
	}
	conf.TransformForLegacyNw(ipv6Enabled)
	out, err := conf.Generate(false)
	if err != nil {
		return nil, err
	}
	return &File{Content: out, Hash: []byte(digest.FromBytes(out))}, nil
}
// GetNameservers returns nameservers (if any) listed in /etc/resolv.conf,
// filtered by kind (IP, IPv4 or IPv6). A parse failure yields nil.
func GetNameservers(resolvConf []byte, kind int) []string {
	conf, err := resolvconf.Parse(bytes.NewBuffer(resolvConf), "")
	if err != nil {
		return nil
	}
	var nameservers []string
	for _, addr := range conf.NameServers() {
		switch {
		case kind == IP:
			nameservers = append(nameservers, addr.String())
		case kind == IPv4 && addr.Is4():
			nameservers = append(nameservers, addr.String())
		case kind == IPv6 && addr.Is6():
			nameservers = append(nameservers, addr.String())
		}
	}
	return nameservers
}
// GetNameserversAsPrefix returns nameservers (if any) listed in
// /etc/resolv.conf as CIDR blocks (e.g., "1.2.3.4/32")
func GetNameserversAsPrefix(resolvConf []byte) []netip.Prefix {
	conf, err := resolvconf.Parse(bytes.NewBuffer(resolvConf), "")
	if err != nil {
		return nil
	}
	addrs := conf.NameServers()
	prefixes := make([]netip.Prefix, 0, len(addrs))
	for _, a := range addrs {
		// Host routes: /32 for IPv4 addresses, /128 for IPv6.
		prefixes = append(prefixes, netip.PrefixFrom(a, a.BitLen()))
	}
	return prefixes
}
// GetSearchDomains returns search domains (if any) listed in /etc/resolv.conf
// If more than one search line is encountered, only the contents of the last
// one is returned.
func GetSearchDomains(resolvConf []byte) []string {
	conf, err := resolvconf.Parse(bytes.NewBuffer(resolvConf), "")
	if err != nil {
		return nil
	}
	return conf.Search()
}
// GetOptions returns options (if any) listed in /etc/resolv.conf
// If more than one options line is encountered, only the contents of the last
// one is returned.
func GetOptions(resolvConf []byte) []string {
	conf, err := resolvconf.Parse(bytes.NewBuffer(resolvConf), "")
	if err != nil {
		return nil
	}
	return conf.Options()
}
// Build generates and writes a configuration file to path containing a nameserver
// entry for every element in nameservers, a "search" entry for every element in
// dnsSearch, and an "options" entry for every element in dnsOptions. It returns
// a File containing the generated content and its (sha256) hash.
//
// Note that the resolv.conf file is written, but the hash file is not.
func Build(path string, nameservers, dnsSearch, dnsOptions []string) (*File, error) {
	var addrs []netip.Addr
	for _, nameserver := range nameservers {
		a, err := netip.ParseAddr(nameserver)
		if err != nil {
			return nil, fmt.Errorf("bad nameserver address: %w", err)
		}
		addrs = append(addrs, a)
	}
	var rc resolvconf.ResolvConf
	rc.OverrideNameServers(addrs)
	rc.OverrideSearch(dnsSearch)
	rc.OverrideOptions(dnsOptions)
	content, err := rc.Generate(false)
	if err != nil {
		return nil, err
	}
	// Write the resolv.conf file - it's bind-mounted into the container, so can't
	// move a temp file into place, just have to truncate and write it.
	//
	// TODO(thaJeztah): the Build function is currently only used by BuildKit, which only uses "File.Content", and doesn't require the file to be written.
	if err := os.WriteFile(path, content, 0o644); err != nil {
		return nil, err
	}
	// TODO(thaJeztah): the Build function is currently only used by BuildKit, which does not use the Hash
	return &File{Content: content, Hash: []byte(digest.FromBytes(content))}, nil
}
package libnetwork
import (
"context"
"errors"
"fmt"
"math/rand"
"net"
"net/netip"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/internal/netiputil"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/miekg/dns"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
"golang.org/x/sync/semaphore"
"golang.org/x/time/rate"
)
// DNSBackend represents a backend DNS resolver used for DNS name
// resolution. All the queries to the resolver are forwarded to the
// backend resolver.
type DNSBackend interface {
	// ResolveName resolves a service name to an IPv4 or IPv6 address by searching
	// the networks the sandbox is connected to. The second return value will be
	// true if the name exists in docker domain, even if there are no addresses of
	// the required type. Such queries shouldn't be forwarded to external nameservers.
	ResolveName(ctx context.Context, name string, ipType int) ([]net.IP, bool)
	// ResolveIP returns the service name for the passed in IP. IP is in reverse dotted
	// notation; the format used for DNS PTR records
	ResolveIP(ctx context.Context, name string) string
	// ResolveService returns all the backend details about the containers or hosts
	// backing a service. Its purpose is to satisfy an SRV query. Both returned
	// slices are expected to have the same length (see handleSRVQuery).
	ResolveService(ctx context.Context, name string) ([]*net.SRV, []net.IP)
	// ExecFunc allows a function to be executed in the context of the backend
	// on behalf of the resolver (e.g. to dial from the container's namespace).
	ExecFunc(f func()) error
	// NdotsSet queries the backends ndots dns option settings
	NdotsSet() bool
	// HandleQueryResp passes the name & IP from a response to the backend. backend
	// can use it to maintain any required state about the resolution
	HandleQueryResp(name string, ip net.IP)
}
const (
	dnsPort = "53" // well-known DNS port; NAT redirects it to the resolver's real ports
	ptrIPv4domain = ".in-addr.arpa." // suffix of IPv4 reverse-lookup (PTR) query names
	ptrIPv6domain = ".ip6.arpa." // suffix of IPv6 reverse-lookup (PTR) query names
	respTTL = 600 // TTL (seconds) for records answered from the backend
	maxExtDNS = 3 // max number of external servers to try
	extIOTimeout = 4 * time.Second // timeout for dialing/querying external DNS servers
	maxConcurrent = 1024 // max in-flight forwarded queries (see Resolver.fwdSem)
	logInterval = 2 * time.Second // rate limit for "too many concurrent queries" logging
)
// extDNSEntry describes one external DNS server, and whether queries to it
// must be made from the host's network namespace (HostLoopback).
type extDNSEntry struct {
	IPStr string
	port uint16 // for testing
	HostLoopback bool
}

// String renders the entry for logging; host-loopback servers appear as "host(IP)".
func (e extDNSEntry) String() string {
	if !e.HostLoopback {
		return e.IPStr
	}
	return "host(" + e.IPStr + ")"
}
// Resolver is the embedded DNS server in Docker. It operates by listening on
// the container's loopback interface for DNS queries.
type Resolver struct {
	backend DNSBackend // resolves names/IPs from the networks the sandbox is attached to
	extDNSList [maxExtDNS]extDNSEntry // Ext servers to use when there's no entry in ipToExtDNS.
	ipToExtDNS addrToExtDNSMap // DNS query source IP -> ext servers.
	server *dns.Server // UDP DNS server, serving conn (created by Start)
	conn *net.UDPConn // UDP socket opened by SetupFunc
	tcpServer *dns.Server // TCP DNS server, serving tcpListen (created by Start)
	tcpListen *net.TCPListener // TCP listener opened by SetupFunc
	err error // non-nil until SetupFunc completes successfully; checked by Start
	listenAddress netip.Addr // address the resolver listens on (parsed in NewResolver)
	proxyDNS atomic.Bool // whether queries may be forwarded from the host's namespace
	startCh chan struct{} // 1-slot channel used to serialize Start/Stop
	logger *log.Entry // optional; log() falls back to log.G(ctx) when nil
	fwdSem *semaphore.Weighted // Limit the number of concurrent external DNS requests in-flight
	logInterval rate.Sometimes // Rate-limit logging about hitting the fwdSem limit
}
// NewResolver creates a new instance of the Resolver
func NewResolver(address string, proxyDNS bool, backend DNSBackend) *Resolver {
	// An unparseable address leaves listenAddress as the zero netip.Addr,
	// matching the previous behaviour of ignoring the parse error.
	listenAddr, _ := netip.ParseAddr(address)
	r := &Resolver{
		backend:       backend,
		err:           errors.New("setup not done yet"),
		startCh:       make(chan struct{}, 1),
		fwdSem:        semaphore.NewWeighted(maxConcurrent),
		logInterval:   rate.Sometimes{Interval: logInterval},
		listenAddress: listenAddr,
	}
	r.proxyDNS.Store(proxyDNS)
	return r
}
// addrToExtDNSMap is a mutex-guarded map from a query's source address to the
// external DNS servers configured for that source (see SetExtServersForSrc).
type addrToExtDNSMap struct {
	mu sync.Mutex // guards eMap
	eMap map[netip.Addr][maxExtDNS]extDNSEntry // lazily allocated in set()
}
// get returns the servers registered for addr, and whether an entry existed.
func (am *addrToExtDNSMap) get(addr netip.Addr) ([maxExtDNS]extDNSEntry, bool) {
	am.mu.Lock()
	defer am.mu.Unlock()
	e, found := am.eMap[addr]
	return e, found
}
// set stores up to maxExtDNS entries for addr, or deletes addr's entry when
// entries is empty.
func (am *addrToExtDNSMap) set(addr netip.Addr, entries []extDNSEntry) {
	am.mu.Lock()
	defer am.mu.Unlock()
	if len(entries) == 0 {
		delete(am.eMap, addr)
		return
	}
	var fixed [maxExtDNS]extDNSEntry
	copy(fixed[:], entries) // extra entries beyond maxExtDNS are dropped
	if am.eMap == nil {
		am.eMap = map[netip.Addr][maxExtDNS]extDNSEntry{}
	}
	am.eMap[addr] = fixed
}
// log returns the resolver's configured logger, or the context's logger when
// none was set.
func (r *Resolver) log(ctx context.Context) *log.Entry {
	if l := r.logger; l != nil {
		return l
	}
	return log.G(ctx)
}
// SetupFunc returns the setup function that should be run in the container's
// network namespace. It opens the resolver's UDP and TCP sockets on
// listenAddress:port, and records success/failure in r.err for Start to check.
func (r *Resolver) SetupFunc(port uint16) func() {
	return func() {
		var err error
		addrPort := netip.AddrPortFrom(r.listenAddress, port)
		// DNS operates primarily on UDP
		r.conn, err = net.ListenUDP("udp", net.UDPAddrFromAddrPort(addrPort))
		if err != nil {
			r.err = fmt.Errorf("error in opening name server socket %v", err)
			return
		}
		// Listen on a TCP as well
		r.tcpListen, err = net.ListenTCP("tcp", net.TCPAddrFromAddrPort(addrPort))
		if err != nil {
			r.err = fmt.Errorf("error in opening name TCP server socket %v", err)
			return
		}
		r.err = nil
	}
}
// Start starts the name server for the container.
func (r *Resolver) Start() error {
	// startCh is a 1-slot channel used as a mutex so Start and Stop cannot run
	// concurrently.
	r.startCh <- struct{}{}
	defer func() { <-r.startCh }()
	// make sure the resolver has been setup before starting
	if r.err != nil {
		return r.err
	}
	if err := r.setupNAT(context.TODO()); err != nil {
		return fmt.Errorf("setting up DNAT/SNAT rules failed: %v", err)
	}
	// Serve UDP queries on the socket opened by SetupFunc.
	s := &dns.Server{Handler: dns.HandlerFunc(r.serveDNS), PacketConn: r.conn}
	r.server = s
	go func() {
		if err := s.ActivateAndServe(); err != nil {
			r.log(context.TODO()).WithError(err).Error("[resolver] failed to start PacketConn DNS server")
		}
	}()
	// Serve the same handler over the TCP listener as well.
	tcpServer := &dns.Server{Handler: dns.HandlerFunc(r.serveDNS), Listener: r.tcpListen}
	r.tcpServer = tcpServer
	go func() {
		if err := tcpServer.ActivateAndServe(); err != nil {
			r.log(context.TODO()).WithError(err).Error("[resolver] failed to start TCP DNS server")
		}
	}()
	return nil
}
// Stop stops the name server for the container. A stopped resolver can be
// reused after running the SetupFunc again.
func (r *Resolver) Stop() {
	// Serialize with Start via the 1-slot startCh channel.
	r.startCh <- struct{}{}
	defer func() { <-r.startCh }()
	for _, srv := range []*dns.Server{r.server, r.tcpServer} {
		if srv != nil {
			srv.Shutdown() //nolint:errcheck
		}
	}
	r.conn = nil
	r.tcpServer = nil
	r.err = errors.New("setup not done yet")
	// Replace the semaphore so a restarted resolver begins with a clean limit.
	r.fwdSem = semaphore.NewWeighted(maxConcurrent)
}
// SetExtServers configures the external nameservers the resolver should use
// when forwarding queries, unless SetExtServersForSrc has configured servers
// for the DNS client making the request.
func (r *Resolver) SetExtServers(extDNS []extDNSEntry) {
	filtered := r.filterExtServers(extDNS)
	copy(r.extDNSList[:], filtered)
}
// SetForwardingPolicy re-configures the embedded DNS resolver to either enable or disable forwarding DNS queries to
// external servers. Safe for concurrent use (proxyDNS is atomic).
func (r *Resolver) SetForwardingPolicy(policy bool) {
	r.proxyDNS.Store(policy)
}
// SetExtServersForSrc configures the external nameservers the resolver should
// use when forwarding queries from srcAddr. If set, these servers will be used
// in preference to servers set by SetExtServers. Supplying a nil or empty extDNS
// deletes nameservers for srcAddr.
//
// It currently always returns nil; the error return is kept for interface
// compatibility.
func (r *Resolver) SetExtServersForSrc(srcAddr netip.Addr, extDNS []extDNSEntry) error {
	r.ipToExtDNS.set(srcAddr, r.filterExtServers(extDNS))
	return nil
}
// NameServer returns the IP of the DNS resolver for the containers.
func (r *Resolver) NameServer() netip.Addr {
	return r.listenAddress
}
// ResolverOptions returns resolv.conf options that should be set.
// ndots:0 means only fully-qualified-looking names skip the search list.
func (r *Resolver) ResolverOptions() []string {
	return []string{"ndots:0"}
}
// filterExtServers removes the resolver's own address from extDNS if present,
// and returns the result.
func (r *Resolver) filterExtServers(extDNS []extDNSEntry) []extDNSEntry {
	kept := make([]extDNSEntry, 0, len(extDNS))
	for _, entry := range extDNS {
		if !entry.HostLoopback {
			// Forwarding to ourselves would loop; drop such entries.
			if addr, _ := netip.ParseAddr(entry.IPStr); addr == r.listenAddress {
				log.G(context.TODO()).Infof("[resolver] not using own address (%s) as an external DNS server",
					r.listenAddress)
				continue
			}
		}
		kept = append(kept, entry)
	}
	return kept
}
//nolint:gosec // The RNG is not used in a security-sensitive context.
var (
shuffleRNG = rand.New(rand.NewSource(time.Now().Unix()))
shuffleRNGMu sync.Mutex
)
func shuffleAddr(addr []net.IP) []net.IP {
shuffleRNGMu.Lock()
defer shuffleRNGMu.Unlock()
for i := len(addr) - 1; i > 0; i-- {
r := shuffleRNG.Intn(i + 1) //nolint:gosec // gosec complains about the use of rand here. It should be fine.
addr[i], addr[r] = addr[r], addr[i]
}
return addr
}
// createRespMsg builds a reply skeleton for query with RecursionAvailable set.
func createRespMsg(query *dns.Msg) *dns.Msg {
	reply := new(dns.Msg)
	reply.SetReply(query)
	reply.RecursionAvailable = true
	return reply
}
// handleMXQuery answers an MX query for names known to the backend with an
// empty NOERROR response, and returns (nil, nil) for unknown names so the
// query can be forwarded.
func (r *Resolver) handleMXQuery(ctx context.Context, query *dns.Msg) (*dns.Msg, error) {
	name := query.Question[0].Name
	v4, _ := r.backend.ResolveName(ctx, name, types.IPv4)
	v6, _ := r.backend.ResolveName(ctx, name, types.IPv6)
	if v4 == nil && v6 == nil {
		return nil, nil
	}
	// We were able to resolve the name. Respond with an empty list with
	// RcodeSuccess/NOERROR so that email clients can treat it as "implicit MX"
	// [RFC 5321 Section-5.1] and issue a Type A/AAAA query for the name.
	return createRespMsg(query), nil
}
// handleIPQuery answers an A or AAAA query (selected by ipType) from the
// backend, shuffling multi-address answers. It returns (nil, nil) when the
// name is unknown so the query can be forwarded.
func (r *Resolver) handleIPQuery(ctx context.Context, query *dns.Msg, ipType int) (*dns.Msg, error) {
	name := query.Question[0].Name
	addrs, ok := r.backend.ResolveName(ctx, name, ipType)
	if !ok {
		return nil, nil
	}
	r.log(ctx).Debugf("[resolver] lookup for %s: IP %v", name, addrs)
	resp := createRespMsg(query)
	if len(addrs) > 1 {
		addrs = shuffleAddr(addrs)
	}
	for _, ip := range addrs {
		var rr dns.RR
		if ipType == types.IPv4 {
			rr = &dns.A{
				Hdr: dns.RR_Header{Name: name, Rrtype: dns.TypeA, Class: dns.ClassINET, Ttl: respTTL},
				A:   ip,
			}
		} else {
			rr = &dns.AAAA{
				Hdr:  dns.RR_Header{Name: name, Rrtype: dns.TypeAAAA, Class: dns.ClassINET, Ttl: respTTL},
				AAAA: ip,
			}
		}
		resp.Answer = append(resp.Answer, rr)
	}
	return resp, nil
}
// handlePTRQuery answers a reverse (PTR) query from the backend. It returns
// (nil, nil) when the name is not a known PTR domain or the IP is unknown,
// so the query can be forwarded.
func (r *Resolver) handlePTRQuery(ctx context.Context, query *dns.Msg) (*dns.Msg, error) {
	ptr := query.Question[0].Name
	// Strip the reverse-lookup suffix; what remains is the reversed address,
	// which is the form the backend expects (see DNSBackend.ResolveIP).
	name, rest, ok := strings.Cut(ptr, ptrIPv4domain)
	if !ok || rest != "" {
		name, rest, ok = strings.Cut(ptr, ptrIPv6domain)
	}
	if !ok || rest != "" {
		// Not a known IPv4 or IPv6 PTR domain.
		// Maybe the external DNS servers know what to do with the query?
		return nil, nil
	}
	host := r.backend.ResolveIP(ctx, name)
	if host == "" {
		return nil, nil
	}
	r.log(ctx).Debugf("[resolver] lookup for IP %s: name %s", name, host)
	resp := createRespMsg(query)
	resp.Answer = append(resp.Answer, &dns.PTR{
		Hdr: dns.RR_Header{Name: ptr, Rrtype: dns.TypePTR, Class: dns.ClassINET, Ttl: respTTL},
		Ptr: dns.Fqdn(host),
	})
	return resp, nil
}
// handleSRVQuery answers an SRV query from the backend's service records,
// adding an A record for each target in the Extra section. It returns
// (nil, nil) when the service is unknown so the query can be forwarded, and
// an error when the backend returns mismatched SRV/IP slices.
func (r *Resolver) handleSRVQuery(ctx context.Context, query *dns.Msg) (*dns.Msg, error) {
	svc := query.Question[0].Name
	srv, ip := r.backend.ResolveService(ctx, svc)
	if len(srv) == 0 {
		return nil, nil
	}
	if len(srv) != len(ip) {
		return nil, fmt.Errorf("invalid reply for SRV query %s", svc)
	}
	resp := createRespMsg(query)
	for i, r := range srv {
		resp.Answer = append(resp.Answer, &dns.SRV{
			// Fix: the record type of an SRV answer must be TypeSRV; it was
			// incorrectly set to TypePTR.
			Hdr:    dns.RR_Header{Name: svc, Rrtype: dns.TypeSRV, Class: dns.ClassINET, Ttl: respTTL},
			Port:   r.Port,
			Target: r.Target,
		})
		resp.Extra = append(resp.Extra, &dns.A{
			Hdr: dns.RR_Header{Name: r.Target, Rrtype: dns.TypeA, Class: dns.ClassINET, Ttl: respTTL},
			A:   ip[i],
		})
	}
	return resp, nil
}
// serveDNS is the dns.Handler for both the UDP and TCP servers. It answers
// A/AAAA/MX/PTR/SRV queries from the backend when possible, otherwise (and
// for unsupported query types) forwards to the configured external servers,
// and finally replies SERVFAIL when no answer could be obtained.
func (r *Resolver) serveDNS(w dns.ResponseWriter, query *dns.Msg) {
	var (
		resp *dns.Msg
		err error
	)
	// Ignore malformed/empty queries outright.
	if query == nil || len(query.Question) == 0 {
		return
	}
	queryName := query.Question[0].Name
	queryType := query.Question[0].Qtype
	ctx, span := otel.Tracer("").Start(context.Background(), "resolver.serveDNS", trace.WithAttributes(
		attribute.String("libnet.resolver.query.name", queryName),
		attribute.String("libnet.resolver.query.type", dns.TypeToString[queryType]),
	))
	defer span.End()
	// Try to answer locally; each handler returns (nil, nil) when it cannot,
	// in which case the query falls through to external forwarding below.
	switch queryType {
	case dns.TypeA:
		resp, err = r.handleIPQuery(ctx, query, types.IPv4)
	case dns.TypeAAAA:
		resp, err = r.handleIPQuery(ctx, query, types.IPv6)
	case dns.TypeMX:
		resp, err = r.handleMXQuery(ctx, query)
	case dns.TypePTR:
		resp, err = r.handlePTRQuery(ctx, query)
	case dns.TypeSRV:
		resp, err = r.handleSRVQuery(ctx, query)
	default:
		r.log(ctx).Debugf("[resolver] query type %s is not supported by the embedded DNS and will be forwarded to external DNS", dns.TypeToString[queryType])
	}
	// reply writes msg to the client, falling back to a SERVFAIL on write
	// failure (in case the failure was caused by msg's content, not the
	// connection).
	reply := func(msg *dns.Msg) {
		if err = w.WriteMsg(msg); err != nil {
			r.log(ctx).WithError(err).Error("[resolver] failed to write response")
			span.RecordError(err)
			span.SetStatus(codes.Error, "WriteMsg failed")
			// Make a best-effort attempt to send a failure response to the
			// client so it doesn't have to wait for a timeout if the failure
			// has to do with the content of msg rather than the connection.
			if msg.Rcode != dns.RcodeServerFailure {
				if err := w.WriteMsg(new(dns.Msg).SetRcode(query, dns.RcodeServerFailure)); err != nil {
					r.log(ctx).WithError(err).Error("[resolver] writing ServFail response also failed")
					span.RecordError(err)
				}
			}
		}
	}
	if err != nil {
		r.log(ctx).WithError(err).Errorf("[resolver] failed to handle query: %s (%s)", queryName, dns.TypeToString[queryType])
		reply(new(dns.Msg).SetRcode(query, dns.RcodeServerFailure))
		return
	}
	if resp != nil {
		// We are the authoritative DNS server for this request so it's
		// on us to truncate the response message to the size limit
		// negotiated by the client.
		maxSize := dns.MinMsgSize
		if w.LocalAddr().Network() == "tcp" {
			maxSize = dns.MaxMsgSize
		} else {
			// For UDP, honour a larger limit advertised via EDNS(0), if any.
			if optRR := query.IsEdns0(); optRR != nil {
				if udpsize := int(optRR.UDPSize()); udpsize > maxSize {
					maxSize = udpsize
				}
			}
		}
		resp.Truncate(maxSize)
		span.AddEvent("found local record", trace.WithAttributes(
			attribute.String("libnet.resolver.resp", resp.String()),
		))
		reply(resp)
		return
	}
	// If the user sets ndots > 0 explicitly and the query is
	// in the root domain don't forward it out. We will return
	// failure and let the client retry with the search domain
	// attached.
	if (queryType == dns.TypeA || queryType == dns.TypeAAAA) && r.backend.NdotsSet() &&
		!strings.Contains(strings.TrimSuffix(queryName, "."), ".") {
		resp = createRespMsg(query)
	} else {
		resp = r.forwardExtDNS(ctx, w.LocalAddr().Network(), w.RemoteAddr(), query)
	}
	if resp == nil {
		// We were unable to get an answer from any of the upstream DNS
		// servers or the backend doesn't support proxying DNS requests.
		resp = new(dns.Msg).SetRcode(query, dns.RcodeServerFailure)
	}
	reply(resp)
}
// defaultPort is the DNS port used when an extDNSEntry does not specify one.
const defaultPort = "53"
// dialExtDNS opens a proto ("udp"/"tcp") connection to the external DNS server
// described by server. Host-loopback servers are dialed directly (from the
// host's namespace); all others are dialed via the backend's ExecFunc, i.e.
// from the container's network namespace.
func (r *Resolver) dialExtDNS(proto string, server extDNSEntry) (net.Conn, error) {
	port := defaultPort
	if server.port != 0 {
		port = strconv.FormatUint(uint64(server.port), 10)
	}
	target := net.JoinHostPort(server.IPStr, port)
	if server.HostLoopback {
		return net.DialTimeout(proto, target, extIOTimeout)
	}
	var (
		conn    net.Conn
		dialErr error
	)
	if err := r.backend.ExecFunc(func() {
		conn, dialErr = net.DialTimeout(proto, target, extIOTimeout)
	}); err != nil {
		return nil, err
	}
	if dialErr != nil {
		return nil, dialErr
	}
	return conn, nil
}
// forwardExtDNS tries each configured external DNS server for remoteAddr in
// turn, returning the first usable response, or nil when every server failed
// or forwarding is not permitted.
func (r *Resolver) forwardExtDNS(ctx context.Context, proto string, remoteAddr net.Addr, query *dns.Msg) *dns.Msg {
	ctx, span := otel.Tracer("").Start(ctx, "resolver.forwardExtDNS")
	defer span.End()
	proxyDNS := r.proxyDNS.Load()
	for _, extDNS := range r.extDNS(netiputil.AddrPortFromNet(remoteAddr)) {
		// The fixed-size server array is zero-padded; an empty IPStr marks the end.
		if extDNS.IPStr == "" {
			break
		}
		// If proxyDNS is false, do not forward the request from the host's namespace
		// (don't access an external DNS server from an internal network). But, it is
		// safe to make the request from the container's network namespace - it'll fail
		// if the DNS server is not accessible, but the server may be on-net.
		if !proxyDNS && extDNS.HostLoopback {
			continue
		}
		// limits the number of outstanding concurrent queries.
		//
		// NOTE(review): this `:=` shadows ctx with the timed child, and cancel()
		// runs before r.exchange uses it. exchange appears to use ctx only for
		// tracing/logging (the dns.Client has its own Timeout), so this looks
		// benign — confirm before relying on ctx cancellation here.
		ctx, cancel := context.WithTimeout(ctx, extIOTimeout)
		err := r.fwdSem.Acquire(ctx, 1)
		cancel()
		if err != nil {
			if errors.Is(err, context.DeadlineExceeded) {
				// Rate-limited logging: hitting the concurrency cap can be very noisy.
				r.logInterval.Do(func() {
					r.log(ctx).Errorf("[resolver] more than %v concurrent queries", maxConcurrent)
				})
			}
			return new(dns.Msg).SetRcode(query, dns.RcodeRefused)
		}
		resp := func() *dns.Msg {
			defer r.fwdSem.Release(1)
			return r.exchange(ctx, proto, extDNS, query)
		}()
		if resp == nil {
			continue
		}
		switch resp.Rcode {
		case dns.RcodeServerFailure, dns.RcodeRefused:
			// Server returned FAILURE: continue with the next external DNS server
			// Server returned REFUSED: this can be a transitional status, so continue with the next external DNS server
			r.log(ctx).Debugf("[resolver] external DNS %s:%s returned failure:\n%s", proto, extDNS.IPStr, resp)
			continue
		}
		// Feed resolved A/AAAA records back to the backend so it can track them.
		answers := 0
		for _, rr := range resp.Answer {
			h := rr.Header()
			switch h.Rrtype {
			case dns.TypeA:
				answers++
				ip := rr.(*dns.A).A
				r.log(ctx).Debugf("[resolver] received A record %q for %q from %s:%s", ip, h.Name, proto, extDNS.IPStr)
				r.backend.HandleQueryResp(h.Name, ip)
			case dns.TypeAAAA:
				answers++
				ip := rr.(*dns.AAAA).AAAA
				r.log(ctx).Debugf("[resolver] received AAAA record %q for %q from %s:%s", ip, h.Name, proto, extDNS.IPStr)
				r.backend.HandleQueryResp(h.Name, ip)
			}
		}
		if len(resp.Answer) == 0 {
			r.log(ctx).Debugf("[resolver] external DNS %s:%s returned response with no answers:\n%s", proto, extDNS.IPStr, resp)
		}
		resp.Compress = true
		span.AddEvent("response from upstream server")
		return resp
	}
	span.AddEvent("no response from upstream servers")
	return nil
}
// extDNS returns the external servers to use for a query from remoteAddr:
// the per-source servers registered via SetExtServersForSrc when present,
// otherwise the default list set by SetExtServers.
func (r *Resolver) extDNS(remoteAddr netip.AddrPort) []extDNSEntry {
	entries, found := r.ipToExtDNS.get(remoteAddr.Addr())
	if found {
		return entries[:]
	}
	return r.extDNSList[:]
}
// exchange performs one query/response round trip against extDNS over proto.
// All failures are logged and recorded on the trace span, and reported to the
// caller as a nil response.
func (r *Resolver) exchange(ctx context.Context, proto string, extDNS extDNSEntry, query *dns.Msg) *dns.Msg {
	ctx, span := otel.Tracer("").Start(ctx, "resolver.exchange", trace.WithAttributes(
		attribute.String("libnet.resolver.upstream.proto", proto),
		attribute.String("libnet.resolver.upstream.address", extDNS.IPStr),
		attribute.Bool("libnet.resolver.upstream.host-loopback", extDNS.HostLoopback)))
	defer span.End()

	conn, err := r.dialExtDNS(proto, extDNS)
	if err != nil {
		r.log(ctx).WithError(err).Warn("[resolver] connect failed")
		span.RecordError(err)
		span.SetStatus(codes.Error, "dialExtDNS failed")
		return nil
	}
	defer conn.Close()

	logger := r.log(ctx).WithFields(log.Fields{
		"dns-server":  conn.RemoteAddr().Network() + ":" + conn.RemoteAddr().String(),
		"client-addr": conn.LocalAddr().Network() + ":" + conn.LocalAddr().String(),
		"question":    query.Question[0].String(),
	})
	logger.Debug("[resolver] forwarding query")

	client := &dns.Client{
		Timeout: extIOTimeout,
		// Following the robustness principle, make a best-effort
		// attempt to receive oversized response messages without
		// truncating them on our end to forward verbatim to the client.
		// Some DNS servers (e.g. Mikrotik RouterOS) don't support
		// EDNS(0) and may send replies over UDP longer than 512 bytes
		// regardless of what size limit, if any, was advertised in the
		// query message. Note that ExchangeWithConn will override this
		// value if it detects an EDNS OPT record in query so only
		// oversized replies to non-EDNS queries will benefit.
		UDPSize: dns.MaxMsgSize,
	}
	resp, _, err := client.ExchangeWithConn(query, &dns.Conn{Conn: conn})
	if err != nil {
		logger.WithError(err).Error("[resolver] failed to query external DNS server")
		span.RecordError(err)
		span.SetStatus(codes.Error, "ExchangeWithConn failed")
		return nil
	}
	if resp == nil {
		// Should be impossible, so make noise if it happens anyway.
		logger.Error("[resolver] external DNS returned empty response")
		span.SetStatus(codes.Error, "External DNS returned empty response")
	}
	return resp
}
//go:build !windows
package libnetwork
import (
"context"
"fmt"
"net"
"github.com/docker/docker/daemon/libnetwork/internal/nftables"
"github.com/docker/docker/daemon/libnetwork/iptables"
)
const (
	// output chain used for docker embedded DNS resolver
	outputChain = "DOCKER_OUTPUT"
	// postrouting chain used for docker embedded DNS resolver
	postroutingChain = "DOCKER_POSTROUTING"
)
// setupNAT installs the DNAT/SNAT rules that redirect the container's DNS
// traffic (port 53 towards the resolver address) to the resolver's actual
// listening ports, using nftables when enabled and iptables otherwise.
func (r *Resolver) setupNAT(ctx context.Context) error {
	// Setup must have succeeded before the sockets exist.
	if r.err != nil {
		return r.err
	}
	udpAddr := r.conn.LocalAddr().String()
	tcpAddr := r.tcpListen.Addr().String()
	resolverIP, udpPort, _ := net.SplitHostPort(udpAddr)
	_, tcpPort, _ := net.SplitHostPort(tcpAddr)
	if nftables.Enabled() {
		return r.setupNftablesNAT(ctx, udpAddr, tcpAddr, resolverIP, udpPort, tcpPort)
	}
	return r.setupIptablesNAT(udpAddr, tcpAddr, resolverIP, udpPort, tcpPort)
}
// setupIptablesNAT installs the iptables nat-table rules redirecting the
// container's DNS traffic to the resolver's real sockets. All iptables calls
// run inside the container's namespace via the backend's ExecFunc; the first
// error encountered is captured in setupErr and returned.
func (r *Resolver) setupIptablesNAT(laddr, ltcpaddr, resolverIP, ipPort, tcpPort string) error {
	// DNAT port 53 to the resolver's actual UDP/TCP ports, and SNAT replies so
	// they appear to come from port 53.
	rules := [][]string{
		{"-t", "nat", "-I", outputChain, "-d", resolverIP, "-p", "udp", "--dport", dnsPort, "-j", "DNAT", "--to-destination", laddr},
		{"-t", "nat", "-I", postroutingChain, "-s", resolverIP, "-p", "udp", "--sport", ipPort, "-j", "SNAT", "--to-source", ":" + dnsPort},
		{"-t", "nat", "-I", outputChain, "-d", resolverIP, "-p", "tcp", "--dport", dnsPort, "-j", "DNAT", "--to-destination", ltcpaddr},
		{"-t", "nat", "-I", postroutingChain, "-s", resolverIP, "-p", "tcp", "--sport", tcpPort, "-j", "SNAT", "--to-source", ":" + dnsPort},
	}
	var setupErr error
	err := r.backend.ExecFunc(func() {
		// TODO IPv6 support
		iptable := iptables.GetIptable(iptables.IPv4)
		// insert outputChain and postroutingchain
		// If the jump from OUTPUT already exists, flush our chain to start
		// clean; otherwise create the chain and add the jump.
		if iptable.ExistsNative("nat", "OUTPUT", "-d", resolverIP, "-j", outputChain) {
			if err := iptable.RawCombinedOutputNative("-t", "nat", "-F", outputChain); err != nil {
				setupErr = err
				return
			}
		} else {
			if err := iptable.RawCombinedOutputNative("-t", "nat", "-N", outputChain); err != nil {
				setupErr = err
				return
			}
			if err := iptable.RawCombinedOutputNative("-t", "nat", "-I", "OUTPUT", "-d", resolverIP, "-j", outputChain); err != nil {
				setupErr = err
				return
			}
		}
		// Same dance for the POSTROUTING chain.
		if iptable.ExistsNative("nat", "POSTROUTING", "-d", resolverIP, "-j", postroutingChain) {
			if err := iptable.RawCombinedOutputNative("-t", "nat", "-F", postroutingChain); err != nil {
				setupErr = err
				return
			}
		} else {
			if err := iptable.RawCombinedOutputNative("-t", "nat", "-N", postroutingChain); err != nil {
				setupErr = err
				return
			}
			if err := iptable.RawCombinedOutputNative("-t", "nat", "-I", "POSTROUTING", "-d", resolverIP, "-j", postroutingChain); err != nil {
				setupErr = err
				return
			}
		}
		for _, rule := range rules {
			if iptable.RawCombinedOutputNative(rule...) != nil {
				setupErr = fmt.Errorf("set up rule failed, %v", rule)
				return
			}
		}
	})
	if err != nil {
		// ExecFunc itself failed (rules never ran).
		return err
	}
	return setupErr
}
// setupNftablesNAT builds an nftables "docker-dns" table with dnat/snat base
// chains equivalent to the iptables rules in setupIptablesNAT, then applies it
// inside the container's namespace via the backend's ExecFunc.
func (r *Resolver) setupNftablesNAT(ctx context.Context, laddr, ltcpaddr, resolverIP, ipPort, tcpPort string) error {
	table, err := nftables.NewTable(nftables.IPv4, "docker-dns")
	if err != nil {
		return err
	}
	// DNAT: redirect port-53 traffic aimed at the resolver IP to the real sockets.
	dnatChain, err := table.BaseChain(ctx, "dns-dnat", nftables.BaseChainTypeNAT, nftables.BaseChainHookOutput, nftables.BaseChainPriorityDstNAT)
	if err != nil {
		return err
	}
	if err := dnatChain.AppendRule(ctx, 0, "ip daddr %s udp dport %s counter dnat to %s", resolverIP, dnsPort, laddr); err != nil {
		return err
	}
	if err := dnatChain.AppendRule(ctx, 0, "ip daddr %s tcp dport %s counter dnat to %s", resolverIP, dnsPort, ltcpaddr); err != nil {
		return err
	}
	// SNAT: rewrite replies so they appear to come from port 53.
	snatChain, err := table.BaseChain(ctx, "dns-snat", nftables.BaseChainTypeNAT, nftables.BaseChainHookPostrouting, nftables.BaseChainPrioritySrcNAT)
	if err != nil {
		return err
	}
	if err := snatChain.AppendRule(ctx, 0, "ip saddr %s udp sport %s counter snat to :%s", resolverIP, ipPort, dnsPort); err != nil {
		return err
	}
	if err := snatChain.AppendRule(ctx, 0, "ip saddr %s tcp sport %s counter snat to :%s", resolverIP, tcpPort, dnsPort); err != nil {
		return err
	}
	// Apply the table from within the container's network namespace.
	var setupErr error
	if err := r.backend.ExecFunc(func() {
		setupErr = table.Apply(ctx)
	}); err != nil {
		return err
	}
	return setupErr
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package libnetwork
import (
"context"
"encoding/json"
"fmt"
"net"
"slices"
"sort"
"strings"
"sync"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/etchosts"
"github.com/docker/docker/daemon/libnetwork/osl"
"github.com/docker/docker/daemon/libnetwork/types"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
)
// SandboxOption is an option setter function type used to pass various options to
// NewNetContainer method. The various setter functions of type SandboxOption are
// provided by libnetwork, they look like ContainerOptionXXXX(...). A nil
// SandboxOption is permitted and is skipped by processOptions.
type SandboxOption func(sb *Sandbox)
// processOptions applies each non-nil option setter to the sandbox, in order.
func (sb *Sandbox) processOptions(options ...SandboxOption) {
	for _, apply := range options {
		if apply == nil {
			continue
		}
		apply(sb)
	}
}
// Sandbox provides the control over the network container entity.
// It is a one to one mapping with the container.
type Sandbox struct {
	id                 string                 // sandbox ID
	containerID        string                 // ID of the container this sandbox belongs to
	config             containerConfig        // hosts/resolv.conf and generic container options
	extDNS             []extDNSEntry          // upstream (external) DNS servers for the embedded resolver
	osSbox             *osl.Namespace         // OS-level network namespace; nil until created
	controller         *Controller            // owning controller
	resolver           *Resolver              // embedded DNS resolver; nil until started
	resolverOnce       sync.Once              // guards one-time resolver startup
	endpoints          []*Endpoint            // attached endpoints, kept sorted per Endpoint.Less
	epPriority         map[string]int         // endpoint ID -> gateway-selection priority
	populatedEndpoints map[string]struct{}    // endpoint IDs whose resources were populated
	joinLeaveMu        sync.Mutex             // serializes endpoint join/leave operations
	dbIndex            uint64                 // datastore index
	dbExists           bool                   // whether the object exists in the datastore
	isStub             bool                   // placeholder sandbox restored from store
	inDelete           bool                   // set while a delete is in progress (see delete())
	ingress            bool                   // this is the swarm ingress sandbox
	ndotsSet           bool                   // ndots was set via host config or overrides (see rebuildDNS)
	oslTypes           []osl.SandboxType // slice of properties of this sandbox
	loadBalancerNID    string            // NID that this SB is a load balancer for
	mu                 sync.Mutex        // protects the fields above
	// This mutex is used to serialize service related operation for an endpoint
	// The lock is here because the endpoint is saved into the store so is not unique
	service sync.Mutex
}
// These are the container configs used to customize container /etc/hosts file.
type hostsPathConfig struct {
	hostName        string      // container hostname (may be a FQDN)
	domainName      string      // optional domain appended to hostName
	hostsPath       string      // path of the generated hosts file
	originHostsPath string      // source hosts file to copy from (host-networking)
	extraHosts      []extraHost // additional name/IP records to append
}
// extraHost is a single additional /etc/hosts record (name -> IP).
type extraHost struct {
	name string // hostname(s) for the record
	IP   string // IP address as a string; parsed when the hosts file is built
}
// These are the container configs used to customize container /etc/resolv.conf file.
type resolvConfPathConfig struct {
	resolvConfPath       string   // path of the generated resolv.conf
	originResolvConfPath string   // source resolv.conf to start from; see getOriginResolvConfPath
	resolvConfHashFile   string   // hash file used to detect user modification
	dnsList              []string // nameserver overrides
	dnsSearchList        []string // search-domain overrides
	dnsOptionsList       []string // option overrides
}
// containerConfig aggregates all per-container sandbox configuration.
type containerConfig struct {
	hostsPathConfig
	resolvConfPathConfig
	generic           map[string]any        // opaque labels/options (see Labels/UpdateLabels)
	useDefaultSandBox bool                  // host-networking: share the default sandbox
	useExternalKey    bool                  // namespace key is set externally (runc hook)
	exposedPorts      []types.TransportPort // ports exposed by the container
}
// ID returns the ID of the sandbox.
func (sb *Sandbox) ID() string {
	return sb.id
}
// ContainerID returns the container id associated to this sandbox.
func (sb *Sandbox) ContainerID() string {
	return sb.containerID
}
// Key returns the sandbox's key. For the shared default sandbox
// (host networking) all sandboxes map to the same "default" key.
func (sb *Sandbox) Key() string {
	if sb.config.useDefaultSandBox {
		return osl.GenerateKey("default")
	}
	return osl.GenerateKey(sb.id)
}
// Labels returns a copy of the sandbox's generic labels/options map. The
// returned map is always non-nil and safe for the caller to mutate.
func (sb *Sandbox) Labels() map[string]any {
	sb.mu.Lock()
	defer sb.mu.Unlock()
	labels := make(map[string]any, len(sb.config.generic))
	for key, value := range sb.config.generic {
		labels[key] = value
	}
	return labels
}
// Delete destroys this container after detaching it from all connected endpoints.
func (sb *Sandbox) Delete(ctx context.Context) error {
	return sb.delete(ctx, false)
}
// delete detaches the sandbox from all endpoints and destroys it. With
// force, endpoints are deleted without a prior Leave. If any endpoint's
// network cannot be loaded from the store (outside swarm mode), the sandbox
// is retained so cleanup can be retried.
func (sb *Sandbox) delete(ctx context.Context, force bool) error {
	sb.mu.Lock()
	if sb.inDelete {
		sb.mu.Unlock()
		return types.ForbiddenErrorf("another sandbox delete in progress")
	}
	// Set the inDelete flag. This will ensure that we don't
	// update the store until we have completed all the endpoint
	// leaves and deletes. And when endpoint leaves and deletes
	// are completed then we can finally delete the sandbox object
	// altogether from the data store. If the daemon exits
	// ungracefully in the middle of a sandbox delete this way we
	// will have all the references to the endpoints in the
	// sandbox so that we can clean them up when we restart
	sb.inDelete = true
	sb.mu.Unlock()
	c := sb.controller
	// Detach from all endpoints
	retain := false
	for _, ep := range sb.Endpoints() {
		// gw network endpoint detach and removal are automatic
		if ep.endpointInGWNetwork() && !force {
			continue
		}
		// Retain the sandbox if we can't obtain the network from store.
		if _, err := c.getNetworkFromStore(ep.getNetwork().ID()); err != nil {
			if !c.isSwarmNode() {
				retain = true
			}
			log.G(ctx).Warnf("Failed getting network for ep %s during sandbox %s delete: %v", ep.ID(), sb.ID(), err)
			continue
		}
		// Leave first (unless forced), then delete; failures are logged
		// but do not abort the remaining endpoints.
		if !force {
			if err := ep.Leave(context.WithoutCancel(ctx), sb); err != nil {
				log.G(ctx).Warnf("Failed detaching sandbox %s from endpoint %s: %v\n", sb.ID(), ep.ID(), err)
			}
		}
		if err := ep.Delete(context.WithoutCancel(ctx), force); err != nil {
			log.G(ctx).Warnf("Failed deleting endpoint %s: %v\n", ep.ID(), err)
		}
	}
	if retain {
		// Undo the inDelete flag so that a later delete can retry.
		sb.mu.Lock()
		sb.inDelete = false
		sb.mu.Unlock()
		return fmt.Errorf("could not cleanup all the endpoints in container %s / sandbox %s", sb.containerID, sb.id)
	}
	// Container is going away. Path cache in etchosts is most
	// likely not required any more. Drop it.
	etchosts.Drop(sb.config.hostsPath)
	if sb.resolver != nil {
		sb.resolver.Stop()
	}
	if sb.osSbox != nil && !sb.config.useDefaultSandBox {
		if err := sb.osSbox.Destroy(); err != nil {
			log.G(ctx).WithError(err).Warn("error destroying network sandbox")
		}
	}
	if err := sb.storeDelete(); err != nil {
		log.G(ctx).Warnf("Failed to delete sandbox %s from store: %v", sb.ID(), err)
	}
	// Finally drop the sandbox from the controller's bookkeeping.
	c.mu.Lock()
	if sb.ingress {
		c.ingressSandbox = nil
	}
	delete(c.sandboxes, sb.ID())
	c.mu.Unlock()
	return nil
}
// Rename changes the name of all attached Endpoints. On failure, endpoints
// renamed so far are rolled back to their old names via the deferred
// closures (deliberately registered inside the loop so they all run, in
// reverse order, when Rename returns with err != nil).
func (sb *Sandbox) Rename(name string) error {
	var err error
	for _, ep := range sb.Endpoints() {
		// Endpoints in the gateway network are managed automatically.
		if ep.endpointInGWNetwork() {
			continue
		}
		oldName := ep.Name()
		lEp := ep // capture per-iteration values for the rollback closure
		if err = ep.rename(name); err != nil {
			break
		}
		defer func() {
			if err != nil {
				if err2 := lEp.rename(oldName); err2 != nil {
					log.G(context.TODO()).WithField("old", oldName).WithField("origError", err).WithError(err2).Error("error renaming sandbox")
				}
			}
		}()
	}
	return err
}
// Refresh leaves all the endpoints, resets and re-applies the options,
// re-joins all the endpoints without destroying the osl sandbox
func (sb *Sandbox) Refresh(ctx context.Context, options ...SandboxOption) error {
	// Store connected endpoints
	epList := sb.Endpoints()
	// Detach from all endpoints; failures are logged but do not abort.
	for _, ep := range epList {
		if err := ep.Leave(context.WithoutCancel(ctx), sb); err != nil {
			log.G(ctx).Warnf("Failed detaching sandbox %s from endpoint %s: %v\n", sb.ID(), ep.ID(), err)
		}
	}
	// Re-apply options on a fresh config.
	sb.config = containerConfig{}
	sb.processOptions(options...)
	// Setup discovery files (hosts and resolv.conf).
	if err := sb.setupResolutionFiles(ctx); err != nil {
		return err
	}
	// Re-connect to all endpoints; again best-effort per endpoint.
	for _, ep := range epList {
		if err := ep.Join(context.WithoutCancel(ctx), sb); err != nil {
			log.G(ctx).Warnf("Failed attach sandbox %s to endpoint %s: %v\n", sb.ID(), ep.ID(), err)
		}
	}
	return nil
}
// UpdateLabels merges the given labels into the sandbox's generic options,
// overwriting existing keys and lazily allocating the map on first use.
func (sb *Sandbox) UpdateLabels(labels map[string]any) {
	if sb.config.generic == nil {
		sb.config.generic = make(map[string]any, len(labels))
	}
	for key, value := range labels {
		sb.config.generic[key] = value
	}
}
// MarshalJSON encodes the sandbox as its ID only.
func (sb *Sandbox) MarshalJSON() ([]byte, error) {
	sb.mu.Lock()
	defer sb.mu.Unlock()
	// We are just interested in the container ID. This can be expanded to include all of containerInfo if there is a need
	return json.Marshal(sb.id)
}
// UnmarshalJSON restores the sandbox ID from its JSON representation
// (the inverse of MarshalJSON, which encodes only the ID).
func (sb *Sandbox) UnmarshalJSON(b []byte) (err error) {
	sb.mu.Lock()
	defer sb.mu.Unlock()
	var id string
	if err = json.Unmarshal(b, &id); err != nil {
		return err
	}
	sb.id = id
	return nil
}
// Endpoints returns a snapshot of all the endpoints connected to the sandbox.
// The result is a copy, so callers can iterate without holding sb.mu; it is
// non-nil even when the sandbox has no endpoints.
func (sb *Sandbox) Endpoints() []*Endpoint {
	sb.mu.Lock()
	defer sb.mu.Unlock()
	return append(make([]*Endpoint, 0, len(sb.endpoints)), sb.endpoints...)
}
// addEndpoint inserts ep into sb.endpoints, keeping the slice sorted per
// Endpoint.Less (better default-gateway candidates first).
func (sb *Sandbox) addEndpoint(ep *Endpoint) {
	sb.mu.Lock()
	defer sb.mu.Unlock()
	l := len(sb.endpoints)
	// Binary-search for the insertion point.
	i := sort.Search(l, func(j int) bool {
		return ep.Less(sb.endpoints[j])
	})
	// Grow by one, shift the tail right, and place ep at i.
	sb.endpoints = append(sb.endpoints, nil)
	copy(sb.endpoints[i+1:], sb.endpoints[i:])
	sb.endpoints[i] = ep
}
// removeEndpoint removes ep from the sandbox's endpoint list, taking sb.mu.
func (sb *Sandbox) removeEndpoint(ep *Endpoint) {
	sb.mu.Lock()
	defer sb.mu.Unlock()
	sb.removeEndpointRaw(ep)
}
// removeEndpointRaw removes ep from sb.endpoints, preserving the order of
// the remaining endpoints. It is a no-op if ep is not present.
// Callers must hold sb.mu.
func (sb *Sandbox) removeEndpointRaw(ep *Endpoint) {
	for i, e := range sb.endpoints {
		if e == ep {
			// slices.Delete shifts the tail down and zeroes the vacated
			// slot, so the backing array does not retain a stale
			// *Endpoint reference (the append-based delete idiom leaves
			// a duplicate pointer in the unused capacity).
			sb.endpoints = slices.Delete(sb.endpoints, i, i+1)
			return
		}
	}
}
// GetEndpoint looks up an attached endpoint by its ID, returning nil if no
// endpoint with that ID is connected to the sandbox.
func (sb *Sandbox) GetEndpoint(id string) *Endpoint {
	sb.mu.Lock()
	defer sb.mu.Unlock()
	for _, candidate := range sb.endpoints {
		if candidate.id != id {
			continue
		}
		return candidate
	}
	return nil
}
// HandleQueryResp forwards a DNS query response to every network the
// sandbox's endpoints are attached to.
func (sb *Sandbox) HandleQueryResp(name string, ip net.IP) {
	for _, ep := range sb.Endpoints() {
		ep.getNetwork().HandleQueryResp(name, ip)
	}
}
// ResolveIP performs a reverse lookup of ip on each attached network in
// turn, returning the first non-empty service name found, or "" when no
// network can resolve it.
func (sb *Sandbox) ResolveIP(ctx context.Context, ip string) string {
	log.G(ctx).Debugf("IP To resolve %v", ip)
	for _, ep := range sb.Endpoints() {
		if svc := ep.getNetwork().ResolveIP(ctx, ip); svc != "" {
			return svc
		}
	}
	return ""
}
// ResolveService returns all the backend details about the containers or hosts
// backing a service. Its purpose is to satisfy an SRV query.
func (sb *Sandbox) ResolveService(ctx context.Context, name string) ([]*net.SRV, []net.IP) {
	log.G(ctx).Debugf("Service name To resolve: %v", name)
	// There are DNS implementations that allow SRV queries for names not in
	// the format defined by RFC 2782. Hence specific validations checks are
	// not done
	// Still require at least three dot-separated parts (e.g. _svc._proto.name).
	if parts := strings.SplitN(name, ".", 3); len(parts) < 3 {
		return nil, nil
	}
	// Ask each attached network in turn; first non-empty answer wins.
	for _, ep := range sb.Endpoints() {
		n := ep.getNetwork()
		srv, ip := n.ResolveService(ctx, name)
		if len(srv) > 0 {
			return srv, ip
		}
	}
	return nil, nil
}
// ResolveName resolves a container name (or alias) to its IPs on the
// networks this sandbox is attached to. It returns the addresses and whether
// the lookup succeeded.
func (sb *Sandbox) ResolveName(ctx context.Context, name string, ipType int) ([]net.IP, bool) {
	// Embedded server owns the docker network domain. Resolution should work
	// for both container_name and container_name.network_name
	// We allow '.' in service name and network name. For a name a.b.c.d the
	// following have to tried;
	// {a.b.c.d in the networks container is connected to}
	// {a.b.c in network d},
	// {a.b in network c.d},
	// {a in network b.c.d},
	log.G(ctx).Debugf("Name To resolve: %v", name)
	name = strings.TrimSuffix(name, ".")
	reqName := []string{name}
	networkName := []string{""}
	// Build the parallel (request-name, network-name) candidate lists by
	// repeatedly splitting at the last '.'.
	if strings.Contains(name, ".") {
		var i int
		dup := name
		for {
			if i = strings.LastIndex(dup, "."); i == -1 {
				break
			}
			networkName = append(networkName, name[i+1:])
			reqName = append(reqName, name[:i])
			dup = dup[:i]
		}
	}
	epList := sb.Endpoints()
	// In swarm mode, services with exposed ports are connected to user overlay
	// network, ingress network and docker_gwbridge networks. Name resolution
	// should prioritize returning the VIP/IPs on user overlay network.
	//
	// Re-order the endpoints based on the network-type they're attached to;
	//
	//  1. dynamic networks (user overlay networks)
	//  2. ingress network(s)
	//  3. local networks ("docker_gwbridge")
	if sb.controller.isSwarmNode() {
		sort.Sort(ByNetworkType(epList))
	}
	for i := 0; i < len(reqName); i++ {
		// First check for local container alias
		if ip, ok := sb.resolveName(ctx, reqName[i], networkName[i], epList, true, ipType); ok {
			return ip, true
		}
		// Resolve the actual container name
		if ip, ok := sb.resolveName(ctx, reqName[i], networkName[i], epList, false, ipType); ok {
			return ip, true
		}
	}
	return nil, false
}
// resolveName attempts to resolve nameOrAlias on the given endpoints,
// optionally restricted to the network named networkName. With lookupAlias,
// the name is first translated through the endpoint's alias map; otherwise
// names that happen to be aliases are skipped so a plain lookup does not
// shadow an alias lookup.
func (sb *Sandbox) resolveName(ctx context.Context, nameOrAlias string, networkName string, epList []*Endpoint, lookupAlias bool, ipType int) ([]net.IP, bool) {
	ctx, span := otel.Tracer("").Start(ctx, "Sandbox.resolveName", trace.WithAttributes(
		attribute.String("libnet.resolver.name-or-alias", nameOrAlias),
		attribute.String("libnet.network.name", networkName),
		attribute.Bool("libnet.resolver.alias-lookup", lookupAlias),
		attribute.Int("libnet.resolver.ip-family", ipType)))
	defer span.End()
	for _, ep := range epList {
		if lookupAlias && len(ep.aliases) == 0 {
			continue
		}
		nw := ep.getNetwork()
		// Honor the optional network-name filter.
		if networkName != "" && networkName != nw.Name() {
			continue
		}
		name := nameOrAlias
		if lookupAlias {
			// Translate the alias to the real name; skip endpoints where
			// the alias is not defined.
			ep.mu.Lock()
			alias, ok := ep.aliases[nameOrAlias]
			ep.mu.Unlock()
			if !ok {
				continue
			}
			name = alias
		} else {
			// If it is a regular lookup and if the requested name is an alias
			// don't perform a svc lookup for this endpoint.
			ep.mu.Lock()
			_, ok := ep.aliases[nameOrAlias]
			ep.mu.Unlock()
			if ok {
				continue
			}
		}
		ip, ok := nw.ResolveName(ctx, name, ipType)
		if ok {
			return ip, true
		}
	}
	return nil, false
}
// hasExternalAccess returns true if any of sb's Endpoints appear to have external
// network access.
func (sb *Sandbox) hasExternalAccess() bool {
	for _, ep := range sb.Endpoints() {
		nw := ep.getNetwork()
		// Internal, "null" and "host" networks cannot provide external access.
		if nw.Internal() || nw.Type() == "null" || nw.Type() == "host" {
			continue
		}
		v4, v6 := ep.hasGatewayOrDefaultRoute()
		if v4 || v6 {
			return true
		}
	}
	return false
}
// EnableService makes a managed container's service available by adding the
// endpoint to the service load balancer and service discovery.
func (sb *Sandbox) EnableService() (err error) {
	log.G(context.TODO()).Debugf("EnableService %s START", sb.containerID)
	// On failure, roll back by disabling the service on any endpoints that
	// were already enabled (DisableService walks all endpoints).
	defer func() {
		if err != nil {
			if err2 := sb.DisableService(); err2 != nil {
				log.G(context.TODO()).WithError(err2).WithField("origError", err).Error("Error while disabling service after original error")
			}
		}
	}()
	for _, ep := range sb.Endpoints() {
		if !ep.isServiceEnabled() {
			if err := ep.addServiceInfoToCluster(sb); err != nil {
				return fmt.Errorf("could not update state for endpoint %s into cluster: %v", ep.Name(), err)
			}
			ep.enableService()
		}
	}
	log.G(context.TODO()).Debugf("EnableService %s DONE", sb.containerID)
	return nil
}
// DisableService removes a managed container's endpoints from the load balancer
// and service discovery. It is best-effort: every endpoint is processed, and
// a single aggregate error naming the failed endpoints is returned via the
// named result set in the deferred function.
func (sb *Sandbox) DisableService() (err error) {
	log.G(context.TODO()).Debugf("DisableService %s START", sb.containerID)
	failedEps := []string{}
	defer func() {
		if len(failedEps) > 0 {
			err = fmt.Errorf("failed to disable service on sandbox:%s, for endpoints %s", sb.ID(), strings.Join(failedEps, ","))
		}
	}()
	for _, ep := range sb.Endpoints() {
		if ep.isServiceEnabled() {
			// Record the failure but keep going; ep is still disabled locally.
			if err := ep.deleteServiceInfoFromCluster(sb, false, "DisableService"); err != nil {
				failedEps = append(failedEps, ep.Name())
				log.G(context.TODO()).Warnf("failed update state for endpoint %s into cluster: %v", ep.Name(), err)
			}
			ep.disableService()
		}
	}
	log.G(context.TODO()).Debugf("DisableService %s DONE", sb.containerID)
	return nil
}
// clearNetworkResources releases the OS sandbox resources held for origEp,
// removes the endpoint from the sandbox, and re-selects/updates the gateway
// endpoints if the removal changed them. The store is updated unless the
// sandbox is already being deleted.
func (sb *Sandbox) clearNetworkResources(origEp *Endpoint) error {
	ep := sb.GetEndpoint(origEp.id)
	if ep == nil {
		return fmt.Errorf("could not find the sandbox endpoint data for endpoint %s",
			origEp.id)
	}
	// Snapshot osSbox and inDelete under the lock, then release OS resources
	// without holding sb.mu.
	sb.mu.Lock()
	osSbox := sb.osSbox
	inDelete := sb.inDelete
	sb.mu.Unlock()
	if osSbox != nil {
		releaseOSSboxResources(osSbox, ep)
	}
	sb.mu.Lock()
	delete(sb.populatedEndpoints, ep.ID())
	if len(sb.endpoints) == 0 {
		// sb.endpoints should never be empty and this is unexpected error condition
		// We log an error message to note this down for debugging purposes.
		log.G(context.TODO()).Errorf("No endpoints in sandbox while trying to remove endpoint %s", ep.Name())
		sb.mu.Unlock()
		return nil
	}
	if !slices.Contains(sb.endpoints, ep) {
		log.G(context.TODO()).Warnf("Endpoint %s has already been deleted", ep.Name())
		sb.mu.Unlock()
		return nil
	}
	// Capture the gateway endpoints before and after removal so we can tell
	// whether the default gateway selection changed.
	gwepBefore4, gwepBefore6 := selectGatewayEndpoint(sb.endpoints)
	sb.removeEndpointRaw(ep)
	gwepAfter4, gwepAfter6 := selectGatewayEndpoint(sb.endpoints)
	delete(sb.epPriority, ep.ID())
	sb.mu.Unlock()
	if (gwepAfter4 != nil && gwepBefore4 != gwepAfter4) || (gwepAfter6 != nil && gwepBefore6 != gwepAfter6) {
		if err := sb.updateGateway(gwepAfter4, gwepAfter6); err != nil {
			return fmt.Errorf("updating gateway endpoint: %w", err)
		}
	}
	// Only update the store if we did not come here as part of
	// sandbox delete. If we came here as part of delete then do
	// not bother updating the store. The sandbox object will be
	// deleted anyway
	if !inDelete {
		return sb.storeUpdate(context.TODO())
	}
	return nil
}
// Less defines an ordering over endpoints, with better candidates for the default
// gateway sorted first.
//
// <=> Returns true if a < b, false if a > b and advances to next level if a == b
//
//	ep.prio     <=> epj.prio     # 2 < 1
//	ep.gw       <=> epj.gw       # non-gw < gw
//	ep.internal <=> epj.internal # non-internal < internal
//	ep.hasGw    <=> epj.hasGw    # (gw4 and gw6) < (gw4 or gw6) < (no gw)
//	ep.name     <=> epj.name     # bar < foo
func (ep *Endpoint) Less(epj *Endpoint) bool {
	sbi, _ := ep.getSandbox()
	sbj, _ := epj.getSandbox()
	// Prio defaults to 0
	var prioi, prioj int
	if sbi != nil {
		prioi = sbi.epPriority[ep.ID()]
	}
	if sbj != nil {
		prioj = sbj.epPriority[epj.ID()]
	}
	// Higher priority sorts first.
	if prioi != prioj {
		return prioi > prioj
	}
	// Non-gateway-network endpoints sort before gateway-network ones.
	gwNeti := ep.endpointInGWNetwork()
	gwNetj := epj.endpointInGWNetwork()
	if gwNeti != gwNetj {
		return gwNetj
	}
	// Non-internal networks sort before internal ones.
	inti := ep.getNetwork().Internal()
	intj := epj.getNetwork().Internal()
	if inti != intj {
		return intj
	}
	// More gateway/default-route families (v4+v6 > one > none) sorts first.
	gwCount := func(ep *Endpoint) int {
		gw4, gw6 := ep.hasGatewayOrDefaultRoute()
		if gw4 && gw6 {
			return 2
		}
		if gw4 || gw6 {
			return 1
		}
		return 0
	}
	gwCounti := gwCount(ep)
	gwCountj := gwCount(epj)
	if gwCounti != gwCountj {
		return gwCounti > gwCountj
	}
	// Final tie-break: lexicographic network name.
	return ep.network.Name() < epj.network.Name()
}
// NdotsSet reports whether an "ndots" resolv.conf option was set via host
// config or overrides (determined in rebuildDNS).
func (sb *Sandbox) NdotsSet() bool {
	return sb.ndotsSet
}
//go:build !windows
package libnetwork
import (
"context"
"fmt"
"io/fs"
"net/netip"
"os"
"path/filepath"
"strings"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/etchosts"
"github.com/docker/docker/daemon/libnetwork/internal/resolvconf"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/errdefs"
"github.com/pkg/errors"
"go.opentelemetry.io/otel"
)
const (
	// defaultPrefix is the base directory for generated network files
	// (hosts, resolv.conf) when no explicit path was configured.
	defaultPrefix = "/var/lib/docker/network/files"
	dirPerm       = 0o755 // mode for directories created under defaultPrefix
	filePerm      = 0o644 // mode for generated hosts/resolv.conf files
	// resolverIPSandbox is the address the embedded DNS resolver listens on
	// inside the container's network namespace.
	resolverIPSandbox = "127.0.0.11"
)
// AddHostsEntry adds an entry to /etc/hosts. The extra host is recorded in
// the sandbox config and the whole hosts file is rebuilt.
func (sb *Sandbox) AddHostsEntry(ctx context.Context, name, ip string) error {
	sb.config.extraHosts = append(sb.config.extraHosts, extraHost{name: name, IP: ip})
	return sb.rebuildHostsFile(ctx)
}
// UpdateHostsEntry updates the IP address in a /etc/hosts entry where the
// name matches the regular expression regexp.
func (sb *Sandbox) UpdateHostsEntry(regexp, ip string) error {
	return etchosts.Update(sb.config.hostsPath, ip, regexp)
}
// rebuildHostsFile builds the container's /etc/hosts file, based on the current
// state of the Sandbox (including extra hosts). If called after the container
// namespace has been created, before the user process is started, the container's
// support for IPv6 can be determined and IPv6 hosts will be included/excluded
// accordingly.
func (sb *Sandbox) rebuildHostsFile(ctx context.Context) error {
	// Collect the interface addresses from all endpoints for the hosts records.
	var ifaceIPs []netip.Addr
	for _, ep := range sb.Endpoints() {
		ifaceIPs = append(ifaceIPs, ep.getEtcHostsAddrs()...)
	}
	if err := sb.buildHostsFile(ctx, ifaceIPs); err != nil {
		return errdefs.System(err)
	}
	return nil
}
// startResolver starts the sandbox's embedded DNS resolver exactly once.
// With restore, the existing resolv.conf from a live-restored container is
// kept; otherwise the container's resolv.conf is rewritten first.
func (sb *Sandbox) startResolver(restore bool) {
	sb.resolverOnce.Do(func() {
		var err error
		// The resolver is started with proxyDNS=false if the sandbox does not currently
		// have a gateway. So, if the Sandbox is only connected to an 'internal' network,
		// it will not forward DNS requests to external resolvers. The resolver's
		// proxyDNS setting is then updated as network Endpoints are added/removed.
		sb.resolver = NewResolver(resolverIPSandbox, sb.hasExternalAccess(), sb)
		// Undo the assignment if any later step fails, so a retry is possible
		// on a fresh resolver (resolverOnce still prevents re-entry here).
		defer func() {
			if err != nil {
				sb.resolver = nil
			}
		}()
		// In the case of live restore container is already running with
		// right resolv.conf contents created before. Just update the
		// external DNS servers from the restored sandbox for embedded
		// server to use.
		if !restore {
			err = sb.rebuildDNS()
			if err != nil {
				log.G(context.TODO()).Errorf("Updating resolv.conf failed for container %s, %q", sb.ContainerID(), err)
				return
			}
		}
		sb.resolver.SetExtServers(sb.extDNS)
		// Set up the resolver's sockets inside the container's namespace.
		if err = sb.osSbox.InvokeFunc(sb.resolver.SetupFunc(0)); err != nil {
			log.G(context.TODO()).Errorf("Resolver Setup function failed for container %s, %q", sb.ContainerID(), err)
			return
		}
		if err = sb.resolver.Start(); err != nil {
			log.G(context.TODO()).Errorf("Resolver Start failed for container %s, %q", sb.ContainerID(), err)
		}
	})
}
// setupResolutionFiles creates the sandbox's initial hosts file and
// resolv.conf so they can be mounted during container setup.
func (sb *Sandbox) setupResolutionFiles(ctx context.Context) error {
	_, span := otel.Tracer("").Start(ctx, "libnetwork.Sandbox.setupResolutionFiles")
	defer span.End()
	// Create a hosts file that can be mounted during container setup. For most
	// networking modes (not host networking) it will be re-created before the
	// container start, once its support for IPv6 is known.
	if sb.config.hostsPath == "" {
		sb.config.hostsPath = defaultPrefix + "/" + sb.id + "/hosts"
	}
	dir, _ := filepath.Split(sb.config.hostsPath)
	if err := createBasePath(dir); err != nil {
		return err
	}
	if err := sb.buildHostsFile(ctx, nil); err != nil {
		return err
	}
	return sb.setupDNS()
}
// buildHostsFile writes the container's hosts file: either a copy of the
// host's file (host networking without extra hosts), or a generated file
// containing the extra hosts and records for ifaceIPs, excluding IPv6
// records when IPv6 is known to be disabled.
func (sb *Sandbox) buildHostsFile(ctx context.Context, ifaceIPs []netip.Addr) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.buildHostsFile")
	defer span.End()
	sb.restoreHostsPath()
	dir, _ := filepath.Split(sb.config.hostsPath)
	if err := createBasePath(dir); err != nil {
		return err
	}
	// This is for the host mode networking. If extra hosts are supplied, even though
	// it's host-networking, the container's hosts file is not based on the host's -
	// so that it's possible to override a hostname that's in the host's hosts file.
	// See analysis of how this came about in:
	// https://github.com/moby/moby/pull/48823#issuecomment-2461777129
	if sb.config.useDefaultSandBox && len(sb.config.extraHosts) == 0 {
		// We are working under the assumption that the origin file option had been properly expressed by the upper layer
		// if not here we are going to error out
		if err := copyFile(sb.config.originHostsPath, sb.config.hostsPath); err != nil && !os.IsNotExist(err) {
			return types.InternalErrorf("could not copy source hosts file %s to %s: %v", sb.config.originHostsPath, sb.config.hostsPath, err)
		}
		return nil
	}
	// Build extra records: configured extra hosts first, then interface IPs.
	extraContent := make([]etchosts.Record, 0, len(sb.config.extraHosts)+len(ifaceIPs))
	for _, host := range sb.config.extraHosts {
		addr, err := netip.ParseAddr(host.IP)
		if err != nil {
			return errdefs.InvalidParameter(fmt.Errorf("could not parse extra host IP %s: %v", host.IP, err))
		}
		extraContent = append(extraContent, etchosts.Record{Hosts: host.name, IP: addr})
	}
	extraContent = append(extraContent, sb.makeHostsRecs(ifaceIPs)...)
	// Assume IPv6 support, unless it's definitely disabled.
	if en, ok := sb.IPv6Enabled(); ok && !en {
		return etchosts.BuildNoIPv6(sb.config.hostsPath, extraContent)
	}
	return etchosts.Build(sb.config.hostsPath, extraContent)
}
// makeHostsRecs builds one hosts record per interface address, each mapping
// the container's FQDN (and bare hostname, when the FQDN contains a dot) to
// that address. It returns nil when there are no addresses.
func (sb *Sandbox) makeHostsRecs(ifaceIPs []netip.Addr) []etchosts.Record {
	if len(ifaceIPs) == 0 {
		return nil
	}
	// User might have provided a FQDN in hostname or split it across hostname
	// and domainname. We want the FQDN and the bare hostname.
	hosts := sb.config.hostName
	if sb.config.domainName != "" {
		hosts += "." + sb.config.domainName
	}
	if bare, _, ok := strings.Cut(hosts, "."); ok {
		hosts += " " + bare
	}
	recs := make([]etchosts.Record, 0, len(ifaceIPs))
	for _, addr := range ifaceIPs {
		recs = append(recs, etchosts.Record{Hosts: hosts, IP: addr})
	}
	return recs
}
// addHostsEntries adds hosts records for ifaceAddrs to the container's
// hosts file, dropping IPv6 addresses when IPv6 is known to be disabled in
// the container. Failures are logged, not returned.
func (sb *Sandbox) addHostsEntries(ctx context.Context, ifaceAddrs []netip.Addr) {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.addHostsEntries")
	defer span.End()
	// Assume IPv6 support, unless it's definitely disabled.
	if en, ok := sb.IPv6Enabled(); ok && !en {
		var filtered []netip.Addr
		for _, addr := range ifaceAddrs {
			if !addr.Is6() {
				filtered = append(filtered, addr)
			}
		}
		ifaceAddrs = filtered
	}
	if err := etchosts.Add(sb.config.hostsPath, sb.makeHostsRecs(ifaceAddrs)); err != nil {
		// Log with the traced ctx (not context.TODO()) so the warning is
		// correlated with the span started above.
		log.G(ctx).Warnf("Failed adding service host entries to the running container: %v", err)
	}
}
// deleteHostsEntries removes the hosts records for ifaceAddrs from the
// container's hosts file. Failures are logged, not returned.
func (sb *Sandbox) deleteHostsEntries(ifaceAddrs []netip.Addr) {
	if err := etchosts.Delete(sb.config.hostsPath, sb.makeHostsRecs(ifaceAddrs)); err != nil {
		// "from", not "to": this path removes entries.
		log.G(context.TODO()).Warnf("Failed deleting service host entries from the running container: %v", err)
	}
}
// restoreResolvConfPath fills in the default resolv.conf path for this
// sandbox if none was configured, and (re)derives the hash-file path used
// to detect user modification.
func (sb *Sandbox) restoreResolvConfPath() {
	if sb.config.resolvConfPath == "" {
		sb.config.resolvConfPath = defaultPrefix + "/" + sb.id + "/resolv.conf"
	}
	sb.config.resolvConfHashFile = sb.config.resolvConfPath + ".hash"
}
// restoreHostsPath fills in the default hosts-file path for this sandbox if
// none was configured.
func (sb *Sandbox) restoreHostsPath() {
	if sb.config.hostsPath != "" {
		return
	}
	sb.config.hostsPath = defaultPrefix + "/" + sb.id + "/hosts"
}
// setExternalResolvers records entries as the sandbox's upstream (external)
// DNS servers. An empty list clears the stored servers and logs a warning.
func (sb *Sandbox) setExternalResolvers(entries []resolvconf.ExtDNSEntry) {
	if len(entries) == 0 {
		log.G(context.TODO()).WithField("cid", sb.ContainerID()).Warn("DNS resolver has no external nameservers")
		sb.extDNS = nil
		return
	}
	extDNS := make([]extDNSEntry, 0, len(entries))
	for _, entry := range entries {
		extDNS = append(extDNS, extDNSEntry{
			IPStr:        entry.Addr.String(),
			HostLoopback: entry.HostLoopback,
		})
	}
	sb.extDNS = extDNS
}
// getOriginResolvConfPath returns the configured source resolv.conf path,
// falling back to the host's default resolv.conf location.
func (c *containerConfig) getOriginResolvConfPath() string {
	if c.originResolvConfPath != "" {
		return c.originResolvConfPath
	}
	// Fallback if not specified.
	return resolvconf.Path()
}
// loadResolvConf reads the resolv.conf file at path, and merges in overrides for
// nameservers, options, and search domains.
func (sb *Sandbox) loadResolvConf(path string) (*resolvconf.ResolvConf, error) {
	rc, err := resolvconf.Load(path)
	// A missing file is not an error: start from a zero-valued config.
	if err != nil && !errors.Is(err, fs.ErrNotExist) {
		return nil, err
	}
	// Proceed with rc, which might be zero-valued if path does not exist.
	rc.SetHeader(`# Generated by Docker Engine.
# This file can be edited; Docker Engine will not make further changes once it
# has been modified.`)
	if len(sb.config.dnsList) > 0 {
		var dnsAddrs []netip.Addr
		for _, ns := range sb.config.dnsList {
			addr, err := netip.ParseAddr(ns)
			if err != nil {
				return nil, errors.Wrapf(err, "bad nameserver address %s", ns)
			}
			dnsAddrs = append(dnsAddrs, addr)
		}
		rc.OverrideNameServers(dnsAddrs)
	}
	if len(sb.config.dnsSearchList) > 0 {
		rc.OverrideSearch(sb.config.dnsSearchList)
	}
	if len(sb.config.dnsOptionsList) > 0 {
		rc.OverrideOptions(sb.config.dnsOptionsList)
	}
	return &rc, nil
}
// For a new sandbox, write an initial version of the container's resolv.conf. It'll
// be a copy of the host's file, with overrides for nameservers, options and search
// domains applied.
func (sb *Sandbox) setupDNS() error {
	// Make sure the directory exists.
	sb.restoreResolvConfPath()
	dir, _ := filepath.Split(sb.config.resolvConfPath)
	if err := createBasePath(dir); err != nil {
		return err
	}
	rc, err := sb.loadResolvConf(sb.config.getOriginResolvConfPath())
	if err != nil {
		return err
	}
	// Write both the file and its hash so later edits by the user can be detected.
	return rc.WriteFile(sb.config.resolvConfPath, sb.config.resolvConfHashFile, filePerm)
}
// Called when an endpoint has joined the sandbox.
func (sb *Sandbox) updateDNS(ipv6Enabled bool) error {
	// Don't touch the file if the user has modified it (or if the check fails).
	if mod, err := resolvconf.UserModified(sb.config.resolvConfPath, sb.config.resolvConfHashFile); err != nil || mod {
		return err
	}
	// Load the host's resolv.conf as a starting point.
	rc, err := sb.loadResolvConf(sb.config.getOriginResolvConfPath())
	if err != nil {
		return err
	}
	// For host-networking, no further change is needed.
	if !sb.config.useDefaultSandBox {
		// The legacy bridge network has no internal nameserver. So, strip localhost
		// nameservers from the host's config, then add default nameservers if there
		// are none remaining.
		rc.TransformForLegacyNw(ipv6Enabled)
	}
	return rc.WriteFile(sb.config.resolvConfPath, sb.config.resolvConfHashFile, filePerm)
}
// Embedded DNS server has to be enabled for this sandbox. Rebuild the container's resolv.conf.
func (sb *Sandbox) rebuildDNS() error {
	// Don't touch the file if the user has modified it.
	if mod, err := resolvconf.UserModified(sb.config.resolvConfPath, sb.config.resolvConfHashFile); err != nil || mod {
		return err
	}
	// Load the host's resolv.conf as a starting point.
	rc, err := sb.loadResolvConf(sb.config.getOriginResolvConfPath())
	if err != nil {
		return err
	}
	intNS := sb.resolver.NameServer()
	if !intNS.IsValid() {
		return errors.New("no listen-address for internal resolver")
	}
	// Work out whether ndots has been set from host config or overrides.
	_, sb.ndotsSet = rc.Option("ndots")
	// Swap nameservers for the internal one, and make sure the required options are set.
	var extNameServers []resolvconf.ExtDNSEntry
	extNameServers, err = rc.TransformForIntNS(intNS, sb.resolver.ResolverOptions())
	if err != nil {
		return err
	}
	// Extract the list of nameservers that just got swapped out, and store them as
	// upstream nameservers.
	sb.setExternalResolvers(extNameServers)
	// Write the file for the container - preserving old behaviour, not updating the
	// hash file (so, no further updates will be made).
	// TODO(robmry) - I think that's probably accidental, I can't find a reason for it,
	// and the old resolvconf.Build() function wrote the file but not the hash, which
	// is surprising. But, before fixing it, a guard/flag needs to be added to
	// sb.updateDNS() to make sure that when an endpoint joins a sandbox that already
	// has an internal resolver, the container's resolv.conf is still (re)configured
	// for an internal resolver.
	return rc.WriteFile(sb.config.resolvConfPath, "", filePerm)
}
// createBasePath ensures dir (and any parents) exists with dirPerm.
func createBasePath(dir string) error {
	return os.MkdirAll(dir, dirPerm)
}
// copyFile copies the contents of src to dst, creating or truncating dst
// with filePerm.
func copyFile(src, dst string) error {
	data, err := os.ReadFile(src)
	if err != nil {
		return err
	}
	return os.WriteFile(dst, data, filePerm)
}
//go:build linux || freebsd
package libnetwork
import (
"context"
"encoding/json"
"flag"
"fmt"
"io"
"net"
"os"
"path/filepath"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/types"
"github.com/docker/docker/internal/otelutil"
"github.com/docker/docker/pkg/stringid"
"github.com/moby/sys/reexec"
"github.com/opencontainers/runtime-spec/specs-go"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/propagation"
"go.opentelemetry.io/otel/trace"
)
const (
	execSubdir      = "libnetwork"  // subdirectory of the exec root holding the setkey socket
	defaultExecRoot = "/run/docker" // default docker exec-root, overridable via -exec-root
	success         = "success"     // reply sent to the client when SetKey succeeds
)
// init registers the re-exec entry point used by the runc prestart hook to
// deliver the container's network namespace key to the daemon.
func init() {
	// TODO(thaJeztah): should this actually be registered on FreeBSD, or only on Linux?
	reexec.Register("libnetwork-setkey", processSetKeyReexec)
}
// setKeyData is the JSON payload exchanged over the setkey unix socket.
type setKeyData struct {
	ContainerID string                // container whose sandbox key is being set
	Key         string                // network namespace path, e.g. /proc/<pid>/ns/net
	OTelTrace   propagation.MapCarrier // propagated OpenTelemetry trace context
}
// processSetKeyReexec is a private function that must be called only on an reexec path
// It expects 3 args { [0] = "libnetwork-setkey", [1] = <container-id>, [2] = <short-controller-id> }
// It also expects specs.State as a json string in <stdin>
// Refer to https://github.com/opencontainers/runc/pull/160/ for more information
// The docker exec-root can be specified as "-exec-root" flag. The default value is "/run/docker".
func processSetKeyReexec() {
	// Restore the OTel trace context propagated via the environment.
	tc := propagation.TraceContext{}
	otel.SetTextMapPropagator(tc)
	carrier := otelutil.PropagateFromEnvironment()
	ctx := tc.Extract(context.Background(), carrier)
	// Any failure is fatal for the re-exec'd helper process.
	if err := setKey(ctx); err != nil {
		_, _ = fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
// setKey parses the re-exec arguments and runc state from stdin, then
// forwards the container's netns path to the daemon via setExternalKey.
func setKey(ctx context.Context) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.setKey", trace.WithSpanKind(trace.SpanKindServer))
	defer span.End()
	execRoot := flag.String("exec-root", defaultExecRoot, "docker exec root")
	flag.Parse()
	// expecting 3 os.Args {[0]="libnetwork-setkey", [1]=<container-id>, [2]=<short-controller-id> }
	// (i.e. expecting 2 flag.Args())
	args := flag.Args()
	if len(args) < 2 {
		return fmt.Errorf("re-exec expects 2 args (after parsing flags), received : %d", len(args))
	}
	containerID, shortCtlrID := args[0], args[1]
	// We expect specs.State as a json string in <stdin>
	var state specs.State
	if err := json.NewDecoder(os.Stdin).Decode(&state); err != nil {
		return err
	}
	return setExternalKey(ctx, shortCtlrID, containerID, fmt.Sprintf("/proc/%d/ns/net", state.Pid), *execRoot)
}
// setExternalKey provides a convenient way to set an External key to a sandbox
func setExternalKey(ctx context.Context, shortCtlrID string, containerID string, key string, execRoot string) error {
	// Connect to the daemon's per-controller setkey unix socket.
	uds := filepath.Join(execRoot, execSubdir, shortCtlrID+".sock")
	c, err := net.Dial("unix", uds)
	if err != nil {
		return err
	}
	defer c.Close()
	d := setKeyData{
		ContainerID: containerID,
		Key:         key,
		OTelTrace:   propagation.MapCarrier{},
	}
	// Carry the current trace context across the process boundary.
	otel.GetTextMapPropagator().Inject(ctx, d.OTelTrace)
	if err := json.NewEncoder(c).Encode(d); err != nil {
		return fmt.Errorf("sendKey failed with : %v", err)
	}
	// Wait for the daemon's "success" (or error text) reply.
	return processReturn(c)
}
// processReturn reads one response from r; anything other than the literal
// success marker is surfaced as an error carrying the daemon's message.
func processReturn(r io.Reader) error {
	resp := make([]byte, 1024)
	n, err := r.Read(resp)
	if err != nil {
		return fmt.Errorf("failed to read buf in processReturn : %v", err)
	}
	if string(resp[:n]) == success {
		return nil
	}
	return fmt.Errorf("%s", resp[:n])
}
// startExternalKeyListener creates the controller's external-key unix socket
// (<execRoot>/<execSubdir>/<short-controller-id>.sock) and starts accepting
// setkey re-exec connections on it in a background goroutine.
func (c *Controller) startExternalKeyListener() error {
	execRoot := defaultExecRoot
	if v := c.Config().ExecRoot; v != "" {
		execRoot = v
	}
	udsBase := filepath.Join(execRoot, execSubdir)
	// NOTE(review): 0o600 on a directory carries no execute/search bit;
	// traversal presumably relies on the daemon running as root — confirm
	// this mode is intended for a directory.
	if err := os.MkdirAll(udsBase, 0o600); err != nil {
		return err
	}
	shortCtlrID := stringid.TruncateID(c.id)
	uds := filepath.Join(udsBase, shortCtlrID+".sock")
	l, err := net.Listen("unix", uds)
	if err != nil {
		return err
	}
	// Restrict the socket itself to the owner before serving on it.
	if err := os.Chmod(uds, 0o600); err != nil {
		l.Close()
		return err
	}
	c.mu.Lock()
	c.extKeyListener = l
	c.mu.Unlock()

	go c.acceptClientConnections(uds, l)
	return nil
}
// acceptClientConnections serves the external-key socket: each connection
// carries one setKeyData message, which is handed to processExternalKey; the
// result (success marker or error text) is written back to the client. The
// loop terminates when the socket file no longer exists, which indicates the
// listener was closed by the daemon.
func (c *Controller) acceptClientConnections(sock string, l net.Listener) {
	for {
		conn, err := l.Accept()
		if err != nil {
			if _, err1 := os.Stat(sock); os.IsNotExist(err1) {
				// This happens when the socket is closed by the daemon, eg. during shutdown.
				log.G(context.TODO()).Debugf("Unix socket %s was closed. The external key listener will stop.", sock)
				return
			}
			log.G(context.TODO()).Errorf("Error accepting connection %v", err)
			continue
		}
		// Handle each client in its own goroutine so a slow client cannot
		// block Accept for others.
		go func() {
			defer conn.Close()

			err := c.processExternalKey(conn)
			ret := success
			if err != nil {
				ret = err.Error()
			}

			_, err = conn.Write([]byte(ret))
			if err != nil {
				log.G(context.TODO()).Errorf("Error returning to the client %v", err)
			}
		}()
	}
}
// processExternalKey reads a single setKeyData JSON message from conn and
// applies the contained namespace key to the addressed sandbox, resuming the
// trace context propagated by the re-exec'd client.
func (c *Controller) processExternalKey(conn net.Conn) error {
	msg := make([]byte, 1280)
	n, err := conn.Read(msg)
	if err != nil {
		return err
	}

	var data setKeyData
	if err := json.Unmarshal(msg[:n], &data); err != nil {
		return err
	}

	ctx := otel.GetTextMapPropagator().Extract(context.Background(), data.OTelTrace)
	sb, err := c.GetSandbox(data.ContainerID)
	if err != nil {
		return types.InvalidParameterErrorf("failed to get sandbox for %s", data.ContainerID)
	}
	return sb.SetKey(ctx, data.Key)
}
// stopExternalKeyListener closes the external-key unix socket listener;
// acceptClientConnections observes the missing socket file and exits.
func (c *Controller) stopExternalKeyListener() {
	c.extKeyListener.Close()
}
package libnetwork
import (
"context"
"fmt"
"net"
"time"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/netutils"
"github.com/docker/docker/daemon/libnetwork/osl"
"github.com/docker/docker/daemon/libnetwork/types"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
)
// containerConfigOS holds Linux-specific container configuration flags.
// It is intentionally empty here; only the Windows variant carries fields.
type containerConfigOS struct{} //nolint:nolintlint,unused // only populated on windows
// releaseOSSboxResources removes endpoint ep's interfaces, DSR loopback VIP
// alias, and static routes from the OS sandbox ns. Failures are logged at
// debug level and do not abort the remaining cleanup.
func releaseOSSboxResources(ns *osl.Namespace, ep *Endpoint) {
	// Only remove the interfaces owned by this endpoint from the sandbox.
	for _, iface := range ns.Interfaces() {
		if !ep.hasInterface(iface.SrcName()) {
			continue
		}
		if err := iface.Remove(); err != nil {
			log.G(context.TODO()).Debugf("Remove interface %s failed: %v", iface.SrcName(), err)
		}
	}

	ep.mu.Lock()
	joinInfo := ep.joinInfo
	vip := ep.virtualIP
	lbModeIsDSR := ep.network.loadBalancerMode == loadBalancerModeDSR
	ep.mu.Unlock()

	// In DSR load-balancer mode the VIP was added as an alias on loopback;
	// take it back off.
	if lbModeIsDSR && len(vip) > 0 {
		ipNet := &net.IPNet{IP: vip, Mask: net.CIDRMask(32, 32)}
		if err := ns.RemoveAliasIP(ns.GetLoopbackIfaceName(), ipNet); err != nil {
			log.G(context.TODO()).WithError(err).Debugf("failed to remove virtual ip %v to loopback", ipNet)
		}
	}

	if joinInfo == nil {
		return
	}

	// Remove non-interface routes.
	for _, route := range joinInfo.StaticRoutes {
		if err := ns.RemoveStaticRoute(route); err != nil {
			log.G(context.TODO()).Debugf("Remove route failed: %v", err)
		}
	}
}
// Statistics retrieves the interfaces' statistics for the sandbox.
func (sb *Sandbox) Statistics() (map[string]*types.InterfaceStatistics, error) {
	stats := make(map[string]*types.InterfaceStatistics)

	sb.mu.Lock()
	osb := sb.osSbox
	sb.mu.Unlock()
	if osb == nil {
		// No OS sandbox yet: report an empty (non-nil) map.
		return stats, nil
	}

	for _, iface := range osb.Interfaces() {
		s, err := iface.Statistics()
		// The entry is recorded even when Statistics fails, matching the
		// map state the caller receives alongside the error.
		stats[iface.DstName()] = s
		if err != nil {
			return stats, err
		}
	}
	return stats, nil
}
// updateGateway clears the sandbox's gateways and default routes, then — for
// each address family whose gateway endpoint has already been populated into
// the OS sandbox — re-installs either the endpoint's gateway address or a
// device default route on the endpoint's interface.
func (sb *Sandbox) updateGateway(ep4, ep6 *Endpoint) error {
	var populated4, populated6 bool
	sb.mu.Lock()
	osSbox := sb.osSbox
	if ep4 != nil {
		_, populated4 = sb.populatedEndpoints[ep4.ID()]
	}
	if ep6 != nil {
		_, populated6 = sb.populatedEndpoints[ep6.ID()]
	}
	sb.mu.Unlock()
	if osSbox == nil {
		return nil
	}
	// Remove any existing gateway/default-route state before re-adding.
	// The Unset* gateway errors are deliberately ignored: there may simply
	// be nothing to remove.
	osSbox.UnsetGateway()     //nolint:errcheck
	osSbox.UnsetGatewayIPv6() //nolint:errcheck
	if err := osSbox.UnsetDefaultRouteIPv4(); err != nil {
		log.G(context.TODO()).WithError(err).Warn("removing IPv4 default route")
	}
	if err := osSbox.UnsetDefaultRouteIPv6(); err != nil {
		log.G(context.TODO()).WithError(err).Warn("removing IPv6 default route")
	}

	if populated4 {
		ep4.mu.Lock()
		joinInfo := ep4.joinInfo
		ep4.mu.Unlock()
		if joinInfo.gw != nil {
			// The driver supplied an IPv4 gateway address.
			if err := osSbox.SetGateway(joinInfo.gw); err != nil {
				return fmt.Errorf("failed to set gateway: %v", err)
			}
		} else {
			// No gateway address: fall back to a device route on the
			// endpoint's interface.
			if err := osSbox.SetDefaultRouteIPv4(ep4.iface.srcName); err != nil {
				return fmt.Errorf("failed to set IPv4 default route: %v", err)
			}
		}
	}

	if populated6 {
		ep6.mu.Lock()
		joinInfo := ep6.joinInfo
		ep6.mu.Unlock()
		if joinInfo.gw6 != nil {
			if err := osSbox.SetGatewayIPv6(joinInfo.gw6); err != nil {
				return fmt.Errorf("failed to set IPv6 gateway: %v", err)
			}
		} else {
			if err := osSbox.SetDefaultRouteIPv6(ep6.iface.srcName); err != nil {
				return fmt.Errorf("failed to set IPv6 default route: %v", err)
			}
		}
	}

	return nil
}
// ExecFunc runs f inside the sandbox's OS namespace; it fails if the OS
// sandbox has not been created yet.
func (sb *Sandbox) ExecFunc(f func()) error {
	sb.mu.Lock()
	osSbox := sb.osSbox
	sb.mu.Unlock()

	if osSbox == nil {
		return fmt.Errorf("osl sandbox unavailable in ExecFunc for %v", sb.ContainerID())
	}
	return osSbox.InvokeFunc(f)
}
// SetKey updates the Sandbox Key, moving the sandbox into the network
// namespace at basePath: any existing OS sandbox is released and destroyed,
// the resolver is restarted in the new namespace, /etc/hosts is rebuilt, and
// every endpoint's network resources are re-populated.
func (sb *Sandbox) SetKey(ctx context.Context, basePath string) error {
	start := time.Now()
	defer func() {
		log.G(ctx).Debugf("sandbox set key processing took %s for container %s", time.Since(start), sb.ContainerID())
	}()

	if basePath == "" {
		return types.InvalidParameterErrorf("invalid sandbox key")
	}

	sb.mu.Lock()
	if sb.inDelete {
		sb.mu.Unlock()
		return types.ForbiddenErrorf("failed to SetKey: sandbox %q delete in progress", sb.id)
	}
	oldosSbox := sb.osSbox
	sb.mu.Unlock()

	if oldosSbox != nil {
		// If we already have an OS sandbox, release the network resources from that
		// and destroy the OS sandbox. We are moving into a new home further down. Note that none
		// of the network resources gets destroyed during the move.
		if err := sb.releaseOSSbox(); err != nil {
			log.G(ctx).WithError(err).Error("Error destroying os sandbox")
		}
	}

	osSbox, err := osl.GetSandboxForExternalKey(basePath, sb.Key())
	if err != nil {
		return err
	}
	sb.mu.Lock()
	sb.osSbox = osSbox
	sb.mu.Unlock()

	// If the resolver was setup before stop it and set it up in the
	// new osl sandbox.
	if oldosSbox != nil && sb.resolver != nil {
		sb.resolver.Stop()

		if err := sb.osSbox.InvokeFunc(sb.resolver.SetupFunc(0)); err == nil {
			if err := sb.resolver.Start(); err != nil {
				log.G(ctx).Errorf("Resolver Start failed for container %s, %q", sb.ContainerID(), err)
			}
		} else {
			log.G(ctx).Errorf("Resolver Setup Function failed for container %s, %q", sb.ContainerID(), err)
		}
	}

	// Re-probe IPv6 loopback support in the new namespace before rebuilding
	// the hosts file.
	osSbox.RefreshIPv6LoEnabled()
	if err := sb.rebuildHostsFile(ctx); err != nil {
		return err
	}

	for _, ep := range sb.Endpoints() {
		if err = sb.populateNetworkResources(ctx, ep); err != nil {
			return err
		}
	}
	return nil
}
// NetnsPath returns the network namespace's path and true, if a network has been
// created - else the empty string and false.
func (sb *Sandbox) NetnsPath() (path string, ok bool) {
	sb.mu.Lock()
	osSbox := sb.osSbox
	sb.mu.Unlock()

	if osSbox != nil {
		return osSbox.Key(), true
	}
	return "", false
}
// IPv6Enabled determines whether a container supports IPv6.
// IPv6 support can always be determined for host networking. For other network
// types it can only be determined once there's a container namespace to probe,
// return ok=false in that case.
func (sb *Sandbox) IPv6Enabled() (enabled, ok bool) {
	if sb.config.useDefaultSandBox {
		// Host networking: IPv6 support is whatever the host has.
		return netutils.IsV6Listenable(), true
	}

	// Otherwise, check whether the container's loopback interface has an
	// IPv6 address — only possible once the OS sandbox exists.
	sb.mu.Lock()
	osSbox := sb.osSbox
	sb.mu.Unlock()

	if osSbox != nil {
		return osSbox.IPv6LoEnabled(), true
	}
	return false, false
}
// releaseOSSbox detaches the OS sandbox from sb under lock, releases each
// endpoint's resources from it, and finally destroys the namespace. It is a
// no-op when no OS sandbox is attached.
func (sb *Sandbox) releaseOSSbox() error {
	sb.mu.Lock()
	osSbox := sb.osSbox
	sb.osSbox = nil
	sb.mu.Unlock()

	if osSbox == nil {
		return nil
	}

	for _, ep := range sb.Endpoints() {
		releaseOSSboxResources(osSbox, ep)
	}

	return osSbox.Destroy()
}
// restoreOslSandbox rebuilds the OS sandbox's view of interfaces, static
// routes and gateways from the endpoints attached to sb. It is used on the
// live-restore path after a daemon restart.
func (sb *Sandbox) restoreOslSandbox() error {
	var routes []*types.StaticRoute

	// restore osl sandbox
	interfaces := make(map[osl.Iface][]osl.IfaceOption)
	for _, ep := range sb.endpoints {
		ep.mu.Lock()
		joinInfo := ep.joinInfo
		i := ep.iface
		ep.mu.Unlock()

		// An endpoint without interface info cannot be restored; log and
		// keep going with the others.
		if i == nil {
			log.G(context.TODO()).Errorf("error restoring endpoint %s for container %s", ep.Name(), sb.ContainerID())
			continue
		}

		ifaceOptions := []osl.IfaceOption{
			osl.WithIPv4Address(i.addr),
			osl.WithRoutes(i.routes),
		}
		if i.addrv6 != nil && i.addrv6.IP.To16() != nil {
			ifaceOptions = append(ifaceOptions, osl.WithIPv6Address(i.addrv6))
		}
		if i.mac != nil {
			ifaceOptions = append(ifaceOptions, osl.WithMACAddress(i.mac))
		}
		if len(i.llAddrs) != 0 {
			ifaceOptions = append(ifaceOptions, osl.WithLinkLocalAddresses(i.llAddrs))
		}

		iface := osl.Iface{SrcName: i.srcName, DstPrefix: i.dstPrefix, DstName: i.dstName}
		interfaces[iface] = ifaceOptions
		if joinInfo != nil {
			routes = append(routes, joinInfo.StaticRoutes...)
		}
		if ep.needResolver() {
			sb.startResolver(true)
		}
	}

	if err := sb.osSbox.RestoreInterfaces(interfaces); err != nil {
		return err
	}
	if len(routes) > 0 {
		sb.osSbox.RestoreRoutes(routes)
	}
	// Restore gateway bookkeeping for each address family that has a
	// gateway endpoint.
	if gwEp4, gwEp6 := sb.getGatewayEndpoint(); gwEp4 != nil || gwEp6 != nil {
		if gwEp4 != nil {
			sb.osSbox.RestoreGateway(true, gwEp4.joinInfo.gw, gwEp4.iface.srcName)
		}
		if gwEp6 != nil {
			sb.osSbox.RestoreGateway(false, gwEp6.joinInfo.gw6, gwEp6.iface.srcName)
		}
	}

	return nil
}
// populateNetworkResources programs endpoint ep into the sandbox's network
// namespace — interface, routes, DSR VIP alias, gateways and load-balancer
// state — then persists the sandbox unless it is being deleted.
func (sb *Sandbox) populateNetworkResources(ctx context.Context, ep *Endpoint) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.Sandbox.populateNetworkResources", trace.WithAttributes(
		attribute.String("endpoint.Name", ep.Name())))
	defer span.End()

	sb.mu.Lock()
	if sb.osSbox == nil {
		// Nothing to program into yet.
		sb.mu.Unlock()
		return nil
	}
	inDelete := sb.inDelete
	sb.mu.Unlock()

	ep.mu.Lock()
	joinInfo := ep.joinInfo
	i := ep.iface
	lbModeIsDSR := ep.network.loadBalancerMode == loadBalancerModeDSR
	ep.mu.Unlock()

	if ep.needResolver() {
		sb.startResolver(false)
	}

	if i != nil && i.srcName != "" {
		var ifaceOptions []osl.IfaceOption

		ifaceOptions = append(ifaceOptions, osl.WithIPv4Address(i.addr), osl.WithRoutes(i.routes))
		if i.addrv6 != nil && i.addrv6.IP.To16() != nil {
			ifaceOptions = append(ifaceOptions, osl.WithIPv6Address(i.addrv6))
		}
		if len(i.llAddrs) != 0 {
			ifaceOptions = append(ifaceOptions, osl.WithLinkLocalAddresses(i.llAddrs))
		}
		if i.mac != nil {
			ifaceOptions = append(ifaceOptions, osl.WithMACAddress(i.mac))
		}
		if sysctls := ep.getSysctls(); len(sysctls) > 0 {
			ifaceOptions = append(ifaceOptions, osl.WithSysctls(sysctls))
		}
		if n := ep.getNetwork(); n != nil {
			// Optional per-network address-advertisement tuning.
			if nMsgs, ok := n.advertiseAddrNMsgs(); ok {
				ifaceOptions = append(ifaceOptions, osl.WithAdvertiseAddrNMsgs(nMsgs))
			}
			if interval, ok := n.advertiseAddrInterval(); ok {
				ifaceOptions = append(ifaceOptions, osl.WithAdvertiseAddrInterval(interval))
			}
		}
		ifaceOptions = append(ifaceOptions, osl.WithCreatedInContainer(i.createdInContainer))

		if err := sb.osSbox.AddInterface(ctx, i.srcName, i.dstPrefix, i.dstName, ifaceOptions...); err != nil {
			return fmt.Errorf("failed to add interface %s to sandbox: %v", i.srcName, err)
		}

		if len(ep.virtualIP) > 0 && lbModeIsDSR {
			if sb.loadBalancerNID == "" {
				if err := sb.osSbox.DisableARPForVIP(i.srcName); err != nil {
					return fmt.Errorf("failed disable ARP for VIP: %v", err)
				}
			}
			// In DSR mode the service VIP lives as an alias on loopback.
			ipNet := &net.IPNet{IP: ep.virtualIP, Mask: net.CIDRMask(32, 32)}
			if err := sb.osSbox.AddAliasIP(sb.osSbox.GetLoopbackIfaceName(), ipNet); err != nil {
				return fmt.Errorf("failed to add virtual ip %v to loopback: %v", ipNet, err)
			}
		}
	}

	if joinInfo != nil {
		// Set up non-interface routes.
		for _, r := range joinInfo.StaticRoutes {
			if err := sb.osSbox.AddStaticRoute(r); err != nil {
				return fmt.Errorf("failed to add static route %s: %v", r.Destination.String(), err)
			}
		}
	}

	// Make sure to add the endpoint to the populated endpoint set
	// before updating gateways or populating loadbalancers.
	sb.mu.Lock()
	sb.populatedEndpoints[ep.ID()] = struct{}{}
	sb.mu.Unlock()

	if gw4, gw6 := sb.getGatewayEndpoint(); ep == gw4 || ep == gw6 {
		if err := sb.updateGateway(gw4, gw6); err != nil {
			return fmt.Errorf("updating gateway endpoint: %w", err)
		}
	}

	// Populate load balancer only after updating all the other
	// information including gateway and other routes so that
	// loadbalancers are populated all the network state is in
	// place in the sandbox.
	sb.populateLoadBalancers(ep)

	// Only update the store if we did not come here as part of
	// sandbox delete. If we came here as part of delete then do
	// not bother updating the store. The sandbox object will be
	// deleted anyway
	if !inDelete {
		return sb.storeUpdate(ctx)
	}
	return nil
}
package libnetwork
import (
"github.com/docker/docker/daemon/libnetwork/netlabel"
"github.com/docker/docker/daemon/libnetwork/osl"
"github.com/docker/docker/daemon/libnetwork/types"
)
// OptionHostname returns a SandboxOption that records the hostname to use
// inside the sandbox; pass it to NewSandbox.
func OptionHostname(name string) SandboxOption {
	return func(s *Sandbox) {
		s.config.hostName = name
	}
}
// OptionDomainname returns a SandboxOption that records the domain name to
// use inside the sandbox; pass it to NewSandbox.
func OptionDomainname(name string) SandboxOption {
	return func(s *Sandbox) {
		s.config.domainName = name
	}
}
// OptionHostsPath returns a SandboxOption that sets the path of the
// sandbox's hosts file; pass it to NewSandbox.
func OptionHostsPath(path string) SandboxOption {
	return func(s *Sandbox) {
		s.config.hostsPath = path
	}
}
// OptionOriginHostsPath returns a SandboxOption that sets the path of the
// origin hosts file; pass it to NewSandbox.
func OptionOriginHostsPath(path string) SandboxOption {
	return func(s *Sandbox) {
		s.config.originHostsPath = path
	}
}
// OptionExtraHost returns a SandboxOption that appends one extra
// /etc/hosts entry, given as a name and an IP string.
func OptionExtraHost(name string, IP string) SandboxOption {
	return func(s *Sandbox) {
		s.config.extraHosts = append(s.config.extraHosts, extraHost{name: name, IP: IP})
	}
}
// OptionResolvConfPath returns a SandboxOption that sets the path of the
// sandbox's resolv.conf file.
func OptionResolvConfPath(path string) SandboxOption {
	return func(s *Sandbox) {
		s.config.resolvConfPath = path
	}
}
// OptionOriginResolvConfPath returns a SandboxOption that sets the path of
// the origin resolv.conf file.
func OptionOriginResolvConfPath(path string) SandboxOption {
	return func(s *Sandbox) {
		s.config.originResolvConfPath = path
	}
}
// OptionDNS returns a SandboxOption that records the DNS server list for
// the sandbox.
func OptionDNS(dns []string) SandboxOption {
	return func(s *Sandbox) {
		s.config.dnsList = dns
	}
}
// OptionDNSSearch returns a SandboxOption that records the DNS search-domain
// list for the sandbox.
func OptionDNSSearch(search []string) SandboxOption {
	return func(s *Sandbox) {
		s.config.dnsSearchList = search
	}
}
// OptionDNSOptions returns a SandboxOption that records the resolver
// options list for the sandbox.
func OptionDNSOptions(options []string) SandboxOption {
	return func(s *Sandbox) {
		s.config.dnsOptionsList = options
	}
}
// OptionUseDefaultSandbox returns a SandboxOption that makes the sandbox
// use the default sandbox (host namespace).
func OptionUseDefaultSandbox() SandboxOption {
	return func(s *Sandbox) {
		s.config.useDefaultSandBox = true
	}
}
// OptionUseExternalKey returns a SandboxOption that makes the sandbox use an
// externally provided namespace instead of creating one.
func OptionUseExternalKey() SandboxOption {
	return func(s *Sandbox) {
		s.config.useExternalKey = true
	}
}
// OptionExposedPorts returns a SandboxOption recording the container's
// exposed ports; a defensive copy is stored both on the sandbox config and
// under the generic netlabel so drivers can read it.
func OptionExposedPorts(exposedPorts []types.TransportPort) SandboxOption {
	return func(s *Sandbox) {
		if s.config.generic == nil {
			s.config.generic = make(map[string]interface{})
		}
		// Defensive copy: callers may mutate their slice afterwards.
		eps := make([]types.TransportPort, len(exposedPorts))
		copy(eps, exposedPorts)
		// Store endpoint label and in generic because driver needs it
		s.config.exposedPorts = eps
		s.config.generic[netlabel.ExposedPorts] = eps
	}
}
// OptionPortMapping returns a SandboxOption recording the container's port
// bindings; a defensive copy is stored under the generic netlabel for the
// driver to consume.
func OptionPortMapping(portBindings []types.PortBinding) SandboxOption {
	return func(s *Sandbox) {
		if s.config.generic == nil {
			s.config.generic = make(map[string]interface{})
		}
		// Store a copy of the bindings as generic data to pass to the driver
		pbs := make([]types.PortBinding, len(portBindings))
		copy(pbs, portBindings)
		s.config.generic[netlabel.PortMap] = pbs
	}
}
// OptionIngress returns a SandboxOption that marks the sandbox as the
// controller's ingress sandbox.
func OptionIngress() SandboxOption {
	return func(s *Sandbox) {
		s.ingress = true
		s.oslTypes = append(s.oslTypes, osl.SandboxTypeIngress)
	}
}
// OptionLoadBalancer returns a SandboxOption that marks the sandbox as a
// load-balancer sandbox for network nid.
func OptionLoadBalancer(nid string) SandboxOption {
	return func(s *Sandbox) {
		s.loadBalancerNID = nid
		s.oslTypes = append(s.oslTypes, osl.SandboxTypeLoadBalancer)
	}
}
package libnetwork
import (
"context"
"encoding/json"
"errors"
"fmt"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/datastore"
"github.com/docker/docker/daemon/libnetwork/osl"
"github.com/docker/docker/daemon/libnetwork/scope"
"github.com/docker/docker/pkg/stringid"
)
const (
	// sandboxPrefix is the datastore key prefix under which sandbox state
	// objects are stored.
	sandboxPrefix = "sandbox"
)

// epState identifies one endpoint attached to a persisted sandbox.
type epState struct {
	Eid string // endpoint ID
	Nid string // network ID
}

// sbState is the persisted (datastore) representation of a Sandbox.
type sbState struct {
	ID         string
	Cid        string // container ID
	c          *Controller
	dbIndex    uint64
	dbExists   bool
	Eps        []epState
	EpPriority map[string]int
	// external servers have to be persisted so that on restart of a live-restore
	// enabled daemon we get the external servers for the running containers.
	//
	// It is persisted as "ExtDNS2" for historical reasons. ExtDNS2 was used to
	// handle migration between docker < 1.14 and >= 1.14. Before version 1.14 we
	// used ExtDNS but with a []string. As it's unlikely that installations still
	// have state from before 1.14, we've dropped the migration code.
	ExtDNS []extDNSEntry `json:"ExtDNS2"`
}
// Key returns the fully-qualified datastore key for this sandbox state.
func (sbs *sbState) Key() []string {
	return []string{sandboxPrefix, sbs.ID}
}

// KeyPrefix returns the datastore key prefix shared by all sandbox states.
func (sbs *sbState) KeyPrefix() []string {
	return []string{sandboxPrefix}
}

// Value serializes the state to JSON; it returns nil if marshalling fails.
func (sbs *sbState) Value() []byte {
	b, err := json.Marshal(sbs)
	if err != nil {
		return nil
	}
	return b
}

// SetValue deserializes the state from its JSON representation.
func (sbs *sbState) SetValue(value []byte) error {
	return json.Unmarshal(value, sbs)
}
// Index returns the datastore index, taking the larger of this state's index
// and the live sandbox's index when the sandbox still exists.
func (sbs *sbState) Index() uint64 {
	sb, err := sbs.c.SandboxByID(sbs.ID)
	if err != nil {
		return sbs.dbIndex
	}

	maxIndex := sb.dbIndex
	if sbs.dbIndex > maxIndex {
		maxIndex = sbs.dbIndex
	}

	return maxIndex
}

// SetIndex records the new datastore index on this state and, when the live
// sandbox still exists, mirrors it there as well.
func (sbs *sbState) SetIndex(index uint64) {
	sbs.dbIndex = index
	sbs.dbExists = true

	sb, err := sbs.c.SandboxByID(sbs.ID)
	if err != nil {
		return
	}

	sb.dbIndex = index
	sb.dbExists = true
}

// Exists reports whether this state has been written to the datastore,
// consulting the live sandbox when the local flag is not set.
func (sbs *sbState) Exists() bool {
	if sbs.dbExists {
		return sbs.dbExists
	}

	sb, err := sbs.c.SandboxByID(sbs.ID)
	if err != nil {
		return false
	}

	return sb.dbExists
}

// Skip reports whether persistence should skip this object; sandbox states
// are always persisted.
func (sbs *sbState) Skip() bool {
	return false
}

// New returns an empty sbState bound to the same controller.
func (sbs *sbState) New() datastore.KVObject {
	return &sbState{c: sbs.c}
}
// CopyTo copies this state into o, which must be an *sbState. Note that the
// Eps and ExtDNS slices are appended to whatever the destination already
// holds rather than replacing it.
func (sbs *sbState) CopyTo(o datastore.KVObject) error {
	dstSbs := o.(*sbState)
	dstSbs.c = sbs.c
	dstSbs.ID = sbs.ID
	dstSbs.Cid = sbs.Cid
	dstSbs.dbIndex = sbs.dbIndex
	dstSbs.dbExists = sbs.dbExists
	dstSbs.EpPriority = sbs.EpPriority

	dstSbs.Eps = append(dstSbs.Eps, sbs.Eps...)
	dstSbs.ExtDNS = append(dstSbs.ExtDNS, sbs.ExtDNS...)

	return nil
}
// storeUpdate persists the sandbox state to the controller's datastore,
// regenerating the endpoint list from the in-memory sandbox and retrying for
// as long as the store reports a concurrent modification.
func (sb *Sandbox) storeUpdate(ctx context.Context) error {
	sbs := &sbState{
		c:          sb.controller,
		ID:         sb.id,
		Cid:        sb.containerID,
		EpPriority: sb.epPriority,
		ExtDNS:     sb.extDNS,
	}

	for {
		sbs.Eps = nil
		for _, ep := range sb.Endpoints() {
			// Endpoints that are not themselves persisted are left out of
			// the sandbox checkpoint.
			if ep.Skip() {
				continue
			}
			sbs.Eps = append(sbs.Eps, epState{
				Nid: ep.getNetwork().ID(),
				Eid: ep.ID(),
			})
		}

		err := sb.controller.updateToStore(ctx, sbs)
		if errors.Is(err, datastore.ErrKeyModified) {
			// On ErrKeyModified simply rebuild and retry: the persisted
			// state is always regenerated from the in-memory sandbox, so
			// there is nothing to re-read from the store first.
			continue
		}
		return err
	}
}
// storeDelete removes the sandbox's persisted state from the datastore.
func (sb *Sandbox) storeDelete() error {
	state := &sbState{
		c:        sb.controller,
		ID:       sb.id,
		Cid:      sb.containerID,
		dbExists: sb.dbExists,
	}
	return sb.controller.store.DeleteObject(state)
}
// sandboxRestore restores Sandbox objects from the store, deleting them if they're not active.
func (c *Controller) sandboxRestore(activeSandboxes map[string]interface{}) error {
	sandboxStates, err := c.store.List(&sbState{c: c})
	if err != nil {
		if errors.Is(err, datastore.ErrKeyNotFound) {
			// It's normal for no sandboxes to be found. Just bail out.
			return nil
		}
		return fmt.Errorf("failed to get sandboxes: %v", err)
	}

	for _, s := range sandboxStates {
		sbs := s.(*sbState)
		// Start from a stub sandbox carrying only the persisted fields.
		sb := &Sandbox{
			id:                 sbs.ID,
			controller:         sbs.c,
			containerID:        sbs.Cid,
			epPriority:         sbs.EpPriority,
			extDNS:             sbs.ExtDNS,
			endpoints:          []*Endpoint{},
			populatedEndpoints: map[string]struct{}{},
			dbIndex:            sbs.dbIndex,
			isStub:             true,
			dbExists:           true,
		}

		create := true
		isRestore := false
		if val, ok := activeSandboxes[sb.ID()]; ok {
			// The container is still running: re-apply its sandbox options
			// and reuse (rather than create) its namespace below.
			sb.isStub = false
			isRestore = true
			opts := val.([]SandboxOption)
			sb.processOptions(opts...)
			sb.restoreHostsPath()
			sb.restoreResolvConfPath()
			create = !sb.config.useDefaultSandBox
		}

		ctx := context.TODO()
		ctx = log.WithLogger(ctx, log.G(ctx).WithFields(log.Fields{
			"sid":       stringid.TruncateID(sb.ID()),
			"cid":       stringid.TruncateID(sb.ContainerID()),
			"isRestore": isRestore,
		}))

		sb.osSbox, err = osl.NewSandbox(sb.Key(), create, isRestore)
		if err != nil {
			log.G(ctx).WithError(err).Error("Failed to create osl sandbox while trying to restore sandbox")
			continue
		}

		c.mu.Lock()
		c.sandboxes[sb.id] = sb
		c.mu.Unlock()

		for _, eps := range sbs.Eps {
			ctx := log.WithLogger(ctx, log.G(ctx).WithFields(log.Fields{
				"nid": stringid.TruncateID(eps.Nid),
				"eid": stringid.TruncateID(eps.Eid),
			}))
			// If the Network or Endpoint can't be loaded from the store, log and continue. Something
			// might go wrong later, but it might just be a reference to a deleted network/endpoint
			// (in which case, the best thing to do is to continue to run/delete the Sandbox with the
			// available configuration).
			n, err := c.getNetworkFromStore(eps.Nid)
			if err != nil {
				log.G(ctx).WithError(err).Warn("Failed to restore endpoint, getNetworkFromStore failed")
				continue
			}
			ep, err := n.getEndpointFromStore(eps.Eid)
			if err != nil {
				log.G(ctx).WithError(err).Warn("Failed to restore endpoint, getEndpointFromStore failed")
				continue
			}
			sb.addEndpoint(ep)
		}

		if !isRestore {
			// The container is gone: this is a stale sandbox left over by a
			// previous daemon run. Remove it and move on.
			log.G(ctx).Info("Removing stale sandbox")
			if err := sb.delete(context.WithoutCancel(ctx), true); err != nil {
				log.G(ctx).WithError(err).Warn("Failed to delete sandbox while trying to clean up")
			}
			continue
		}

		for _, ep := range sb.endpoints {
			sb.populatedEndpoints[ep.id] = struct{}{}
		}

		// reconstruct osl sandbox field
		if !sb.config.useDefaultSandBox {
			if err := sb.restoreOslSandbox(); err != nil {
				log.G(ctx).WithError(err).Error("Failed to populate fields for osl sandbox")
				continue
			}
		} else {
			// FIXME(thaJeztah): osSbox (and thus defOsSbox) is always nil on non-Linux: move this code to Linux-only files.
			c.defOsSboxOnce.Do(func() {
				c.defOsSbox = sb.osSbox
			})
		}

		for _, ep := range sb.endpoints {
			if !c.isAgent() {
				n := ep.getNetwork()
				if !c.isSwarmNode() || n.Scope() != scope.Swarm || !n.driverIsMultihost() {
					n.updateSvcRecord(context.WithoutCancel(ctx), ep, true)
				}
			}
		}
	}

	return nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package libnetwork
import (
"fmt"
"net"
"sync"
"github.com/docker/docker/daemon/libnetwork/internal/setmatrix"
)
var (
	// fwMarkCtr is a global monotonic counter used to assign firewall
	// marks to services; it starts at 256 and is guarded by fwMarkCtrMu.
	fwMarkCtr   uint32 = 256
	fwMarkCtrMu sync.Mutex
)
type portConfigs []*PortConfig

// String renders the port list as comma-separated
// "published:target/protocol" entries; an empty list renders as "".
func (p portConfigs) String() string {
	if len(p) == 0 {
		return ""
	}
	var str string
	for idx, pc := range p {
		if idx > 0 {
			str += ","
		}
		str += fmt.Sprintf("%d:%d/%s", pc.PublishedPort, pc.TargetPort, PortConfig_Protocol_name[int32(pc.Protocol)])
	}
	return str
}
// serviceKey uniquely identifies a service binding by service ID plus the
// stringified ingress-port configuration.
type serviceKey struct {
	id    string
	ports string
}

type service struct {
	name string // Service Name
	id   string // Service ID

	// Map of loadbalancers for the service one-per attached
	// network. It is keyed with network ID.
	loadBalancers map[string]*loadBalancer

	// List of ingress ports exposed by the service
	ingressPorts portConfigs

	// Service aliases
	aliases []string

	// This maps tracks for each IP address the list of endpoints ID
	// associated with it. At stable state the endpoint ID expected is 1
	// but during transition and service change it is possible to have
	// temporary more than 1
	ipToEndpoint setmatrix.SetMatrix[string, string]

	// deleted marks the binding as removed; guarded by the embedded mutex.
	deleted bool

	sync.Mutex
}
// assignIPToEndpoint inserts the mapping between the IP and the endpoint identifier
// returns true if the mapping was not present, false otherwise
// returns also the number of endpoints associated to the IP
func (s *service) assignIPToEndpoint(ip, eID string) (bool, int) {
	return s.ipToEndpoint.Insert(ip, eID)
}

// removeIPToEndpoint removes the mapping between the IP and the endpoint identifier
// returns true if the mapping was deleted, false otherwise
// returns also the number of endpoints associated to the IP
func (s *service) removeIPToEndpoint(ip, eID string) (bool, int) {
	return s.ipToEndpoint.Remove(ip, eID)
}

// printIPToEndpoint renders the endpoint set currently associated with ip;
// the boolean reports whether ip has any entry at all.
func (s *service) printIPToEndpoint(ip string) (string, bool) {
	return s.ipToEndpoint.String(ip)
}
// lbBackend is a single backend behind a load balancer. The disabled flag
// presumably excludes the backend from forwarding while keeping its
// bookkeeping — confirm at the use sites.
type lbBackend struct {
	ip       net.IP
	disabled bool
}

type loadBalancer struct {
	vip    net.IP
	fwMark uint32

	// Map of backend IPs backing this loadbalancer on this
	// network. It is keyed with endpoint ID.
	backEnds map[string]*lbBackend

	// Back pointer to service to which the loadbalancer belongs.
	service *service
	sync.Mutex
}
//go:build linux || windows
package libnetwork
import (
"context"
"net"
"github.com/containerd/log"
)
// maxSetStringLen caps the length of strings derived from set-matrix
// contents (its use is not visible in this file section — presumably for
// log truncation; confirm at the use sites).
const maxSetStringLen = 350
// addEndpointNameResolution registers DNS records for an endpoint joining a
// service: the container/task-alias records, "tasks.<name>" round-robin
// records, and either VIP-based or round-robin service-name records
// depending on whether a VIP is present.
func (c *Controller) addEndpointNameResolution(svcName, svcID, nID, eID, containerName string, vip net.IP, serviceAliases, taskAliases []string, ip net.IP, addService bool, method string) error {
	n, err := c.NetworkByID(nID)
	if err != nil {
		return err
	}

	log.G(context.TODO()).Debugf("addEndpointNameResolution %s %s add_service:%t sAliases:%v tAliases:%v", eID, svcName, addService, serviceAliases, taskAliases)

	// Add container resolution mappings
	if err := c.addContainerNameResolution(nID, eID, containerName, taskAliases, ip, method); err != nil {
		return err
	}

	serviceID := svcID
	if serviceID == "" {
		// This is the case of a normal container not part of a service
		serviceID = eID
	}

	// Add endpoint IP to special "tasks.svc_name" so that the applications have access to DNS RR.
	n.addSvcRecords(eID, "tasks."+svcName, serviceID, ip, nil, false, method)
	for _, alias := range serviceAliases {
		n.addSvcRecords(eID, "tasks."+alias, serviceID, ip, nil, false, method)
	}

	// Add service name to vip in DNS, if vip is valid. Otherwise resort to DNS RR
	if len(vip) == 0 {
		n.addSvcRecords(eID, svcName, serviceID, ip, nil, false, method)
		for _, alias := range serviceAliases {
			n.addSvcRecords(eID, alias, serviceID, ip, nil, false, method)
		}
	}

	if addService && len(vip) != 0 {
		n.addSvcRecords(eID, svcName, serviceID, vip, nil, false, method)
		for _, alias := range serviceAliases {
			n.addSvcRecords(eID, alias, serviceID, vip, nil, false, method)
		}
	}

	return nil
}
// addContainerNameResolution registers DNS records mapping the container
// name and each of its task aliases to ip on network nID.
func (c *Controller) addContainerNameResolution(nID, eID, containerName string, taskAliases []string, ip net.IP, method string) error {
	n, err := c.NetworkByID(nID)
	if err != nil {
		return err
	}
	log.G(context.TODO()).Debugf("addContainerNameResolution %s %s", eID, containerName)

	// Add resolution for container name
	n.addSvcRecords(eID, containerName, eID, ip, nil, true, method)

	// Add resolution for taskaliases
	for _, alias := range taskAliases {
		n.addSvcRecords(eID, alias, eID, ip, nil, false, method)
	}

	return nil
}
// deleteEndpointNameResolution removes the DNS records added by
// addEndpointNameResolution. multipleEntries suppresses record removal when
// other entries for the same name remain; rmService additionally removes
// the VIP-based service record.
func (c *Controller) deleteEndpointNameResolution(svcName, svcID, nID, eID, containerName string, vip net.IP, serviceAliases, taskAliases []string, ip net.IP, rmService, multipleEntries bool, method string) error {
	n, err := c.NetworkByID(nID)
	if err != nil {
		return err
	}

	log.G(context.TODO()).Debugf("deleteEndpointNameResolution %s %s rm_service:%t suppress:%t sAliases:%v tAliases:%v", eID, svcName, rmService, multipleEntries, serviceAliases, taskAliases)

	// Delete container resolution mappings; a failure here is logged but
	// does not stop the remaining record removal.
	if err := c.delContainerNameResolution(nID, eID, containerName, taskAliases, ip, method); err != nil {
		log.G(context.TODO()).WithError(err).Warn("Error deleting container from resolver")
	}

	serviceID := svcID
	if serviceID == "" {
		// This is the case of a normal container not part of a service
		serviceID = eID
	}

	// Delete the special "tasks.svc_name" backend record.
	if !multipleEntries {
		n.deleteSvcRecords(eID, "tasks."+svcName, serviceID, ip, nil, false, method)
		for _, alias := range serviceAliases {
			n.deleteSvcRecords(eID, "tasks."+alias, serviceID, ip, nil, false, method)
		}
	}

	// If we are doing DNS RR delete the endpoint IP from DNS record right away.
	if !multipleEntries && len(vip) == 0 {
		n.deleteSvcRecords(eID, svcName, serviceID, ip, nil, false, method)
		for _, alias := range serviceAliases {
			n.deleteSvcRecords(eID, alias, serviceID, ip, nil, false, method)
		}
	}

	// Remove the DNS record for VIP only if we are removing the service
	if rmService && len(vip) != 0 && !multipleEntries {
		n.deleteSvcRecords(eID, svcName, serviceID, vip, nil, false, method)
		for _, alias := range serviceAliases {
			n.deleteSvcRecords(eID, alias, serviceID, vip, nil, false, method)
		}
	}

	return nil
}
// delContainerNameResolution removes the DNS records for the container name
// and each of its task aliases on network nID.
func (c *Controller) delContainerNameResolution(nID, eID, containerName string, taskAliases []string, ip net.IP, method string) error {
	n, err := c.NetworkByID(nID)
	if err != nil {
		return err
	}
	log.G(context.TODO()).Debugf("delContainerNameResolution %s %s", eID, containerName)

	// Delete resolution for container name
	n.deleteSvcRecords(eID, containerName, eID, ip, nil, true, method)

	// Delete resolution for taskaliases
	// NOTE(review): the penultimate argument is true here, while the
	// matching add path (addContainerNameResolution) passes false for task
	// aliases — confirm this asymmetry is intentional.
	for _, alias := range taskAliases {
		n.deleteSvcRecords(eID, alias, eID, ip, nil, true, method)
	}

	return nil
}
// newService constructs a service with the given identity, ingress ports and
// aliases, and an empty per-network load-balancer map.
func newService(name string, id string, ingressPorts []*PortConfig, serviceAliases []string) *service {
	svc := &service{
		name:          name,
		id:            id,
		ingressPorts:  ingressPorts,
		aliases:       serviceAliases,
		loadBalancers: make(map[string]*loadBalancer),
	}
	return svc
}
// getLBIndex returns the firewall mark of the load balancer backing service
// sid on network nid, or 0 when either the service binding or its load
// balancer on that network does not exist.
func (c *Controller) getLBIndex(sid, nid string, ingressPorts []*PortConfig) int {
	skey := serviceKey{
		id:    sid,
		ports: portConfigs(ingressPorts).String(),
	}
	c.mu.Lock()
	s, ok := c.serviceBindings[skey]
	c.mu.Unlock()

	if !ok {
		return 0
	}

	s.Lock()
	lb := s.loadBalancers[nid]
	s.Unlock()

	// The service may have no load balancer on this network (e.g. it was
	// removed concurrently); previously this dereferenced a nil pointer.
	if lb == nil {
		return 0
	}
	return int(lb.fwMark)
}
// cleanupServiceDiscovery erases the service-discovery records associated
// with cleanupNID when that network is being deleted; an empty ID wipes the
// records of every network.
func (c *Controller) cleanupServiceDiscovery(cleanupNID string) {
	c.mu.Lock()
	defer c.mu.Unlock()

	if cleanupNID == "" {
		log.G(context.TODO()).Debugf("cleanupServiceDiscovery for all networks")
		c.svcRecords = make(map[string]*svcInfo)
		return
	}
	log.G(context.TODO()).Debugf("cleanupServiceDiscovery for network:%s", cleanupNID)
	delete(c.svcRecords, cleanupNID)
}
// cleanupServiceBindings removes the load-balancer service bindings for
// network cleanupNID, or for every network when the ID is empty. Cleanup
// closures are collected while holding each service's lock and invoked only
// afterwards, so rmServiceBinding never runs under a lock.
func (c *Controller) cleanupServiceBindings(cleanupNID string) {
	var cleanupFuncs []func()

	log.G(context.TODO()).Debugf("cleanupServiceBindings for %s", cleanupNID)
	c.mu.Lock()
	services := make([]*service, 0, len(c.serviceBindings))
	for _, s := range c.serviceBindings {
		services = append(services, s)
	}
	c.mu.Unlock()

	for _, s := range services {
		s.Lock()
		// Skip the serviceBindings that got deleted
		if s.deleted {
			s.Unlock()
			continue
		}
		for nid, lb := range s.loadBalancers {
			if cleanupNID != "" && nid != cleanupNID {
				continue
			}
			for eid, be := range lb.backEnds {
				cleanupFuncs = append(cleanupFuncs, makeServiceCleanupFunc(c, s, nid, eid, lb.vip, be.ip))
			}
		}
		s.Unlock()
	}

	for _, f := range cleanupFuncs {
		f()
	}
}
// makeServiceCleanupFunc returns a closure that fully removes the
// service binding of endpoint eID on network nID.
func makeServiceCleanupFunc(c *Controller, s *service, nID, eID string, vip net.IP, ip net.IP) func() {
	// ContainerName and taskAliases are unknown at this point; that is
	// fine because the service-discovery cleanup has already run. The
	// only remaining job of rmServiceBinding here, besides load-balancer
	// bookkeeping, is keeping the endpoint-to-IP mapping consistent.
	return func() {
		err := c.rmServiceBinding(s.name, s.id, nID, eID, "", vip, s.ingressPorts, s.aliases, []string{}, ip, "cleanupServiceBindings", false, true)
		if err != nil {
			log.G(context.TODO()).Errorf("Failed to remove service bindings for service %s network %s endpoint %s while cleanup: %v", s.id, nID, eID, err)
		}
	}
}
// addServiceBinding registers endpoint eID as a backend of service
// svcID on network nID: it records the backend in the service's
// per-network load balancer (creating the service object and the load
// balancer — with a fresh fwmark — on first use), programs the backend
// into the network's load-balancing sandbox, and adds the name
// resolutions for the service, its aliases and the container.
func (c *Controller) addServiceBinding(svcName, svcID, nID, eID, containerName string, vip net.IP, ingressPorts []*PortConfig, serviceAliases, taskAliases []string, ip net.IP, method string) error {
	var addService bool

	// Failure to lock the network ID on add can result in racing
	// against network deletion resulting in inconsistent state
	// in the c.serviceBindings map and it's sub-maps. Also,
	// always lock network ID before services to avoid deadlock.
	c.networkLocker.Lock(nID)
	defer c.networkLocker.Unlock(nID) //nolint:errcheck

	n, err := c.NetworkByID(nID)
	if err != nil {
		return err
	}

	skey := serviceKey{
		id:    svcID,
		ports: portConfigs(ingressPorts).String(),
	}

	var s *service
	for {
		c.mu.Lock()
		var ok bool
		s, ok = c.serviceBindings[skey]
		if !ok {
			// Create a new service if we are seeing this service
			// for the first time.
			s = newService(svcName, svcID, ingressPorts, serviceAliases)
			c.serviceBindings[skey] = s
		}
		c.mu.Unlock()
		s.Lock()
		if !s.deleted {
			// ok the object is good to be used
			break
		}
		// The object was marked deleted (by rmServiceBinding) between
		// our lookup and taking its lock; retry to fetch or recreate a
		// live one.
		s.Unlock()
	}
	log.G(context.TODO()).Debugf("addServiceBinding from %s START for %s %s p:%p nid:%s skey:%v", method, svcName, eID, s, nID, skey)
	// s is locked from the loop above; hold it for the rest of the call.
	defer s.Unlock()

	lb, ok := s.loadBalancers[nID]
	if !ok {
		// Create a new load balancer if we are seeing this
		// network attachment on the service for the first
		// time.
		fwMarkCtrMu.Lock()

		lb = &loadBalancer{
			vip:      vip,
			fwMark:   fwMarkCtr,
			backEnds: make(map[string]*lbBackend),
			service:  s,
		}
		fwMarkCtr++
		fwMarkCtrMu.Unlock()

		s.loadBalancers[nID] = lb
		addService = true
	}

	lb.backEnds[eID] = &lbBackend{ip, false}

	// Track the IP→endpoint assignment; more than one entry for the
	// same IP indicates a (possibly transient) inconsistency worth
	// logging.
	ok, entries := s.assignIPToEndpoint(ip.String(), eID)
	if !ok || entries > 1 {
		setStr, b := s.printIPToEndpoint(ip.String())
		if len(setStr) > maxSetStringLen {
			setStr = setStr[:maxSetStringLen]
		}
		log.G(context.TODO()).Warnf("addServiceBinding %s possible transient state ok:%t entries:%d set:%t %s", eID, ok, entries, b, setStr)
	}

	// Add loadbalancer service and backend to the network
	n.addLBBackend(ip, lb)

	// Add the appropriate name resolutions
	if err := c.addEndpointNameResolution(svcName, svcID, nID, eID, containerName, vip, serviceAliases, taskAliases, ip, addService, "addServiceBinding"); err != nil {
		return err
	}

	log.G(context.TODO()).Debugf("addServiceBinding from %s END for %s %s", method, svcName, eID)

	return nil
}
// rmServiceBinding undoes addServiceBinding for endpoint eID of service
// svcID on network nID: it removes (fullRemove) or disables the backend
// in the service's per-network load balancer, updates the dataplane,
// optionally deletes the name resolutions (deleteSvcRecords), and
// garbage-collects the load balancer and the service object once they
// hold no more entries.
func (c *Controller) rmServiceBinding(svcName, svcID, nID, eID, containerName string, vip net.IP, ingressPorts []*PortConfig, serviceAliases []string, taskAliases []string, ip net.IP, method string, deleteSvcRecords bool, fullRemove bool) error {
	var rmService bool

	skey := serviceKey{
		id:    svcID,
		ports: portConfigs(ingressPorts).String(),
	}

	c.mu.Lock()
	s, ok := c.serviceBindings[skey]
	c.mu.Unlock()
	if !ok {
		log.G(context.TODO()).Warnf("rmServiceBinding %s %s %s aborted c.serviceBindings[skey] !ok", method, svcName, eID)
		return nil
	}

	s.Lock()
	defer s.Unlock()
	log.G(context.TODO()).Debugf("rmServiceBinding from %s START for %s %s p:%p nid:%s sKey:%v deleteSvc:%t", method, svcName, eID, s, nID, skey, deleteSvcRecords)
	lb, ok := s.loadBalancers[nID]
	if !ok {
		log.G(context.TODO()).Warnf("rmServiceBinding %s %s %s aborted s.loadBalancers[nid] !ok", method, svcName, eID)
		return nil
	}

	be, ok := lb.backEnds[eID]
	if !ok {
		log.G(context.TODO()).Warnf("rmServiceBinding %s %s %s aborted lb.backEnds[eid] && lb.disabled[eid] !ok", method, svcName, eID)
		return nil
	}

	if fullRemove {
		// delete regardless
		delete(lb.backEnds, eID)
	} else {
		// Keep the backend but mark it disabled; the dataplane will
		// deweight it rather than delete it (see rmLBBackend).
		be.disabled = true
	}

	if len(lb.backEnds) == 0 {
		// All the backends for this service have been
		// removed. Time to remove the load balancer and also
		// remove the service entry in IPVS.
		rmService = true

		delete(s.loadBalancers, nID)
		log.G(context.TODO()).Debugf("rmServiceBinding %s delete %s, p:%p in loadbalancers len:%d", eID, nID, lb, len(s.loadBalancers))
	}

	// Drop the IP→endpoint bookkeeping; leftover entries indicate a
	// (possibly transient) inconsistency worth logging.
	ok, entries := s.removeIPToEndpoint(ip.String(), eID)
	if !ok || entries > 0 {
		setStr, b := s.printIPToEndpoint(ip.String())
		if len(setStr) > maxSetStringLen {
			setStr = setStr[:maxSetStringLen]
		}
		log.G(context.TODO()).Warnf("rmServiceBinding %s possible transient state ok:%t entries:%d set:%t %s", eID, ok, entries, b, setStr)
	}

	// Remove loadbalancer service(if needed) and backend in all
	// sandboxes in the network only if the vip is valid.
	if entries == 0 {
		// The network may well have been deleted from the store (and
		// dataplane) before the last of the service bindings. On Linux that's
		// ok because removing the network sandbox from the dataplane
		// implicitly cleans up all related dataplane state.
		// On the Windows dataplane, VFP policylists must be removed
		// independently of the network, and they must be removed before the HNS
		// network. Otherwise, policylist removal fails with "network not
		// found." On Windows cleanupServiceBindings must be called prior to
		// removing the network from the store or dataplane.
		n, err := c.NetworkByID(nID)
		if err == nil {
			n.rmLBBackend(ip, lb, rmService, fullRemove)
		}
	}

	// Delete the name resolutions
	if deleteSvcRecords {
		if err := c.deleteEndpointNameResolution(svcName, svcID, nID, eID, containerName, vip, serviceAliases, taskAliases, ip, rmService, entries > 0, "rmServiceBinding"); err != nil {
			return err
		}
	}

	if len(s.loadBalancers) == 0 {
		// All loadbalancers for the service removed. Time to
		// remove the service itself.
		c.mu.Lock()

		// Mark the object as deleted so that the add won't use it wrongly
		s.deleted = true
		// NOTE The delete from the serviceBindings map has to be the last operation else we are allowing a race between this service
		// that is getting deleted and a new service that will be created if the entry is not anymore there
		delete(c.serviceBindings, skey)
		c.mu.Unlock()
	}

	log.G(context.TODO()).Debugf("rmServiceBinding from %s END for %s %s", method, svcName, eID)
	return nil
}
package libnetwork
import (
"context"
"errors"
"fmt"
"io"
"net"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"syscall"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/drivers/bridge"
"github.com/docker/docker/daemon/libnetwork/iptables"
"github.com/docker/docker/daemon/libnetwork/ns"
"github.com/ishidawataru/sctp"
"github.com/moby/ipvs"
"github.com/vishvananda/netlink/nl"
)
// populateLoadBalancers sets up, in this sandbox, the ingress redirect
// rules for the network the endpoint belongs to. Endpoints without an
// interface are ignored.
func (sb *Sandbox) populateLoadBalancers(ep *Endpoint) {
	if ep.Iface() == nil {
		// Interface-less endpoint: nothing to program.
		return
	}
	n := ep.getNetwork()
	eIP := ep.Iface().Address()

	if !n.ingress {
		return
	}
	if err := sb.addRedirectRules(eIP, ep.ingressPorts); err != nil {
		log.G(context.TODO()).Errorf("Failed to add redirect rules for ep %s (%.7s): %v", ep.Name(), ep.ID(), err)
	}
}
// findLBEndpointSandbox locates this node's load-balancing endpoint on
// the network (there should be exactly one) along with its sandbox,
// returning the sandbox's view of the endpoint.
func (n *Network) findLBEndpointSandbox() (*Endpoint, *Sandbox, error) {
	// TODO: get endpoint from store? See EndpointInfo()
	var lbEp *Endpoint
	for _, e := range n.Endpoints() {
		if epi := e.Info(); epi != nil && epi.LoadBalancer() {
			lbEp = e
			break
		}
	}
	if lbEp == nil {
		return nil, nil, fmt.Errorf("Unable to find load balancing endpoint for network %s", n.ID())
	}

	// Get the load balancer sandbox itself as well
	sb, ok := lbEp.getSandbox()
	if !ok {
		return nil, nil, fmt.Errorf("Unable to get sandbox for %s(%s) in for %s", lbEp.Name(), lbEp.ID(), n.ID())
	}

	sep := sb.GetEndpoint(lbEp.ID())
	if sep == nil {
		return nil, nil, fmt.Errorf("Load balancing endpoint %s(%s) removed from %s", lbEp.Name(), lbEp.ID(), n.ID())
	}
	return sep, sb, nil
}
// findIfaceDstName searches the OS sandbox for the destination name of
// the interface backing endpoint ep. The name is needed when adding or
// removing IP aliases on that interface. Returns "" when no interface
// matches.
func findIfaceDstName(sb *Sandbox, ep *Endpoint) string {
	want := ep.Iface().SrcName()
	for _, iface := range sb.osSbox.Interfaces() {
		if iface.SrcName() == want {
			return iface.DstName()
		}
	}
	return ""
}
// addLBBackend adds real server ip to IPVS load balancer lb inside this
// network's load-balancing sandbox. If the IPVS service does not exist
// yet it is created first, together with the VIP alias on the LB
// interface, ingress plumbing (for ingress sandboxes) and the fwmark
// rules. No-op when lb has no VIP or the sandbox is gone; errors are
// logged rather than returned.
func (n *Network) addLBBackend(ip net.IP, lb *loadBalancer) {
	if len(lb.vip) == 0 {
		return
	}
	ep, sb, err := n.findLBEndpointSandbox()
	if err != nil {
		log.G(context.TODO()).Errorf("addLBBackend %s/%s: %v", n.ID(), n.Name(), err)
		return
	}
	if sb.osSbox == nil {
		return
	}

	eIP := ep.Iface().Address()

	i, err := ipvs.New(sb.Key())
	if err != nil {
		log.G(context.TODO()).Errorf("Failed to create an ipvs handle for sbox %.7s (%.7s,%s) for lb addition: %v", sb.ID(), sb.ContainerID(), sb.Key(), err)
		return
	}
	defer i.Close()

	// The IPVS service is keyed by the load balancer's fwmark.
	s := &ipvs.Service{
		AddressFamily: nl.FAMILY_V4,
		FWMark:        lb.fwMark,
		SchedName:     ipvs.RoundRobin,
	}

	if !i.IsServicePresent(s) {
		// First backend on this node: set up the service-level state
		// before creating the IPVS service itself.

		// Add IP alias for the VIP to the endpoint
		ifName := findIfaceDstName(sb, ep)
		if ifName == "" {
			log.G(context.TODO()).Errorf("Failed find interface name for endpoint %s(%s) to create LB alias", ep.ID(), ep.Name())
			return
		}
		err := sb.osSbox.AddAliasIP(ifName, &net.IPNet{IP: lb.vip, Mask: net.CIDRMask(32, 32)})
		if err != nil {
			log.G(context.TODO()).Errorf("Failed add IP alias %s to network %s LB endpoint interface %s: %v", lb.vip, n.ID(), ifName, err)
			return
		}

		if sb.ingress {
			var gwIP net.IP
			if gwEP, _ := sb.getGatewayEndpoint(); gwEP != nil {
				gwIP = gwEP.Iface().Address().IP
			}
			if err := programIngress(gwIP, lb.service.ingressPorts, false); err != nil {
				log.G(context.TODO()).Errorf("Failed to add ingress: %v", err)
				return
			}
		}

		log.G(context.TODO()).Debugf("Creating service for vip %s fwMark %d ingressPorts %#v in sbox %.7s (%.7s)", lb.vip, lb.fwMark, lb.service.ingressPorts, sb.ID(), sb.ContainerID())
		if err := sb.configureFWMark(lb.vip, lb.fwMark, lb.service.ingressPorts, eIP, false, n.loadBalancerMode); err != nil {
			log.G(context.TODO()).Errorf("Failed to add firewall mark rule in sbox %.7s (%.7s): %v", sb.ID(), sb.ContainerID(), err)
			return
		}

		if err := i.NewService(s); err != nil && !errors.Is(err, syscall.EEXIST) {
			log.G(context.TODO()).Errorf("Failed to create a new service for vip %s fwmark %d in sbox %.7s (%.7s): %v", lb.vip, lb.fwMark, sb.ID(), sb.ContainerID(), err)
			return
		}
	}

	// Remove the sched name before using the service to add
	// destination.
	s.SchedName = ""
	var flags uint32
	if n.loadBalancerMode == loadBalancerModeDSR {
		flags = ipvs.ConnFwdDirectRoute
	}
	err = i.NewDestination(s, &ipvs.Destination{
		AddressFamily:   nl.FAMILY_V4,
		Address:         ip,
		Weight:          1,
		ConnectionFlags: flags,
	})
	if err != nil && !errors.Is(err, syscall.EEXIST) {
		log.G(context.TODO()).Errorf("Failed to create real server %s for vip %s fwmark %d in sbox %.7s (%.7s): %v", ip, lb.vip, lb.fwMark, sb.ID(), sb.ContainerID(), err)
	}

	// Ensure that kernel tweaks are applied in case this is the first time
	// we've initialized ip_vs
	sb.osSbox.ApplyOSTweaks(sb.oslTypes)
}
// rmLBBackend removes real server ip from IPVS load balancer lb inside
// this network's load-balancing sandbox. If 'rmService' is true, the
// IPVS service entry, ingress plumbing, fwmark rules and VIP alias are
// removed as well. If 'fullRemove' is true the destination is deleted
// outright; otherwise it is only deweighted (weight 0). Errors are
// logged rather than returned.
func (n *Network) rmLBBackend(ip net.IP, lb *loadBalancer, rmService bool, fullRemove bool) {
	if len(lb.vip) == 0 {
		return
	}
	ep, sb, err := n.findLBEndpointSandbox()
	if err != nil {
		log.G(context.TODO()).Debugf("rmLBBackend for %s/%s: %v -- probably transient state", n.ID(), n.Name(), err)
		return
	}
	if sb.osSbox == nil {
		return
	}

	eIP := ep.Iface().Address()

	i, err := ipvs.New(sb.Key())
	if err != nil {
		log.G(context.TODO()).Errorf("Failed to create an ipvs handle for sbox %.7s (%.7s,%s) for lb removal: %v", sb.ID(), sb.ContainerID(), sb.Key(), err)
		return
	}
	defer i.Close()

	s := &ipvs.Service{
		AddressFamily: nl.FAMILY_V4,
		FWMark:        lb.fwMark,
	}

	d := &ipvs.Destination{
		AddressFamily: nl.FAMILY_V4,
		Address:       ip,
		Weight:        1,
	}
	if n.loadBalancerMode == loadBalancerModeDSR {
		d.ConnectionFlags = ipvs.ConnFwdDirectRoute
	}

	if fullRemove {
		if err := i.DelDestination(s, d); err != nil && !errors.Is(err, syscall.ENOENT) {
			log.G(context.TODO()).Errorf("Failed to delete real server %s for vip %s fwmark %d in sbox %.7s (%.7s): %v", ip, lb.vip, lb.fwMark, sb.ID(), sb.ContainerID(), err)
		}
	} else {
		// Deweight the destination instead of removing it.
		d.Weight = 0
		if err := i.UpdateDestination(s, d); err != nil && !errors.Is(err, syscall.ENOENT) {
			log.G(context.TODO()).Errorf("Failed to set LB weight of real server %s to 0 for vip %s fwmark %d in sbox %.7s (%.7s): %v", ip, lb.vip, lb.fwMark, sb.ID(), sb.ContainerID(), err)
		}
	}

	if rmService {
		s.SchedName = ipvs.RoundRobin
		if err := i.DelService(s); err != nil && !errors.Is(err, syscall.ENOENT) {
			log.G(context.TODO()).Errorf("Failed to delete service for vip %s fwmark %d in sbox %.7s (%.7s): %v", lb.vip, lb.fwMark, sb.ID(), sb.ContainerID(), err)
		}

		if sb.ingress {
			var gwIP net.IP
			if gwEP, _ := sb.getGatewayEndpoint(); gwEP != nil {
				gwIP = gwEP.Iface().Address().IP
			}
			if err := programIngress(gwIP, lb.service.ingressPorts, true); err != nil {
				log.G(context.TODO()).Errorf("Failed to delete ingress: %v", err)
			}
		}

		if err := sb.configureFWMark(lb.vip, lb.fwMark, lb.service.ingressPorts, eIP, true, n.loadBalancerMode); err != nil {
			log.G(context.TODO()).Errorf("Failed to delete firewall mark rule in sbox %.7s (%.7s): %v", sb.ID(), sb.ContainerID(), err)
		}

		// Remove IP alias from the VIP to the endpoint
		ifName := findIfaceDstName(sb, ep)
		if ifName == "" {
			// Fixed log message: this path removes the alias (it previously
			// said "create", copy-pasted from addLBBackend).
			log.G(context.TODO()).Errorf("Failed find interface name for endpoint %s(%s) to remove LB alias", ep.ID(), ep.Name())
			return
		}
		err := sb.osSbox.RemoveAliasIP(ifName, &net.IPNet{IP: lb.vip, Mask: net.CIDRMask(32, 32)})
		if err != nil {
			// Fixed log message: this removes the alias (it previously said
			// "Failed add IP alias", copy-pasted from addLBBackend).
			log.G(context.TODO()).Errorf("Failed to remove IP alias %s from network %s LB endpoint interface %s: %v", lb.vip, n.ID(), ifName, err)
		}
	}
}
// ingressChain is the name of the iptables chain (in both the nat and
// filter tables) that holds the ingress load-balancing rules.
const ingressChain = "DOCKER-INGRESS"

var (
	ingressOnce sync.Once
	ingressMu   sync.Mutex // lock for operations on ingress
	// ingressProxyTbl maps a "port/proto" spec to the listener held open
	// for that published port (see plumbProxy).
	ingressProxyTbl = make(map[string]io.Closer)
	portConfigMu    sync.Mutex
	// portConfigTbl reference-counts each plumbed PortConfig (see
	// filterPortConfigs), guarded by portConfigMu.
	portConfigTbl = make(map[PortConfig]int)
)
// filterPortConfigs reference-counts ingressPorts in the global
// portConfigTbl and returns only those ports whose kernel plumbing must
// actually change: first references on add, last references on delete.
func filterPortConfigs(ingressPorts []*PortConfig, isDelete bool) []*PortConfig {
	portConfigMu.Lock()
	defer portConfigMu.Unlock()

	iPorts := make([]*PortConfig, 0, len(ingressPorts))
	for _, pc := range ingressPorts {
		cnt, seen := portConfigTbl[*pc]
		if isDelete {
			if !seen {
				continue
			}
			if cnt == 1 {
				// Last reference to this port config: drop the entry
				// and report the port so its rules get unplumbed.
				delete(portConfigTbl, *pc)
				iPorts = append(iPorts, pc)
			} else {
				portConfigTbl[*pc] = cnt - 1
			}
			continue
		}
		if seen {
			portConfigTbl[*pc] = cnt + 1
			continue
		}
		// First reference: record it and report the port so its rules
		// get plumbed.
		portConfigTbl[*pc] = 1
		iPorts = append(iPorts, pc)
	}
	return iPorts
}
// programIngress sets up (or, with isDelete, tears down) the host-level
// iptables plumbing for ingress load balancing: the DOCKER-INGRESS
// chains in the nat and filter tables, the jump rules into them, the
// route_localnet/MASQUERADE setup for the gateway interface, and the
// per-port DNAT/ACCEPT rules plus proxy listeners for ingressPorts.
// Per-port rules added in one call are rolled back if a later rule in
// the same call fails.
func programIngress(gwIP net.IP, ingressPorts []*PortConfig, isDelete bool) error {
	// TODO IPv6 support
	iptable := iptables.GetIptable(iptables.IPv4)

	// On delete the add/rollback operations swap roles.
	addDelOpt := "-I"
	rollbackAddDelOpt := "-D"
	if isDelete {
		addDelOpt = "-D"
		rollbackAddDelOpt = "-I"
	}

	ingressMu.Lock()
	defer ingressMu.Unlock()

	chainExists := iptable.ExistChain(ingressChain, iptables.Nat)
	filterChainExists := iptable.ExistChain(ingressChain, iptables.Filter)

	ingressOnce.Do(func() {
		// Flush nat table and filter table ingress chain rules during init if it
		// exists. It might contain stale rules from previous life.
		if chainExists {
			if err := iptable.RawCombinedOutput("-t", "nat", "-F", ingressChain); err != nil {
				log.G(context.TODO()).Errorf("Could not flush nat table ingress chain rules during init: %v", err)
			}
		}
		if filterChainExists {
			if err := iptable.RawCombinedOutput("-F", ingressChain); err != nil {
				log.G(context.TODO()).Errorf("Could not flush filter table ingress chain rules during init: %v", err)
			}
		}
	})

	if !isDelete {
		// Ensure the ingress chains and the jump rules into them exist.
		if !chainExists {
			if err := iptable.RawCombinedOutput("-t", "nat", "-N", ingressChain); err != nil {
				return fmt.Errorf("failed to create ingress chain: %v", err)
			}
		}
		if !filterChainExists {
			if err := iptable.RawCombinedOutput("-N", ingressChain); err != nil {
				return fmt.Errorf("failed to create filter table ingress chain: %v", err)
			}
		}

		if !iptable.Exists(iptables.Nat, ingressChain, "-j", "RETURN") {
			if err := iptable.RawCombinedOutput("-t", "nat", "-A", ingressChain, "-j", "RETURN"); err != nil {
				return fmt.Errorf("failed to add return rule in nat table ingress chain: %v", err)
			}
		}

		if !iptable.Exists(iptables.Filter, ingressChain, "-j", "RETURN") {
			if err := iptable.RawCombinedOutput("-A", ingressChain, "-j", "RETURN"); err != nil {
				return fmt.Errorf("failed to add return rule to filter table ingress chain: %v", err)
			}
		}

		for _, chain := range []string{"OUTPUT", "PREROUTING"} {
			if !iptable.Exists(iptables.Nat, chain, "-m", "addrtype", "--dst-type", "LOCAL", "-j", ingressChain) {
				if err := iptable.RawCombinedOutput("-t", "nat", "-I", chain, "-m", "addrtype", "--dst-type", "LOCAL", "-j", ingressChain); err != nil {
					return fmt.Errorf("failed to add jump rule in %s to ingress chain: %v", chain, err)
				}
			}
		}

		// The DOCKER-FORWARD chain is created by the bridge driver on startup. It's a stable place to
		// put the jump to DOCKER-INGRESS (nothing else will ever be inserted before it, and the jump
		// will precede the bridge driver's other rules).
		if !iptable.Exists(iptables.Filter, bridge.DockerForwardChain, "-j", ingressChain) {
			if err := iptable.RawCombinedOutput("-I", bridge.DockerForwardChain, "-j", ingressChain); err != nil {
				return fmt.Errorf("failed to add jump rule to %s in filter table %s chain: %v",
					ingressChain, bridge.DockerForwardChain, err)
			}
		}

		// Remove the jump from FORWARD to DOCKER-INGRESS, if it was created there by a version of
		// the daemon older than 28.0.1.
		// FIXME(robmry) - should only do this once, on startup.
		if iptable.Exists(iptables.Filter, "FORWARD", "-j", ingressChain) {
			if err := iptable.RawCombinedOutput("-D", "FORWARD", "-j", ingressChain); err != nil {
				log.G(context.TODO()).WithError(err).Debug("Failed to delete jump from FORWARD to " + ingressChain)
			}
		}

		oifName, err := findOIFName(gwIP)
		if err != nil {
			return fmt.Errorf("failed to find gateway bridge interface name for %s: %v", gwIP, err)
		}

		// Enable route_localnet on the gateway interface.
		path := filepath.Join("/proc/sys/net/ipv4/conf", oifName, "route_localnet")
		if err := os.WriteFile(path, []byte{'1', '\n'}, 0o644); err != nil { //nolint:gosec // gosec complains about perms here, which must be 0644 in this case
			return fmt.Errorf("could not write to %s: %v", path, err)
		}

		ruleArgs := []string{"-m", "addrtype", "--src-type", "LOCAL", "-o", oifName, "-j", "MASQUERADE"}
		if !iptable.Exists(iptables.Nat, "POSTROUTING", ruleArgs...) {
			if err := iptable.RawCombinedOutput(append([]string{"-t", "nat", "-I", "POSTROUTING"}, ruleArgs...)...); err != nil {
				return fmt.Errorf("failed to add ingress localhost POSTROUTING rule for %s: %v", oifName, err)
			}
		}
	}

	// Filter the ingress ports until port rules start to be added/deleted
	filteredPorts := filterPortConfigs(ingressPorts, isDelete)
	rollbackRules := make([][]string, 0, len(filteredPorts)*3)
	var portErr error
	defer func() {
		// On failure while adding, undo the refcounts and any rules
		// already applied for this call.
		if portErr != nil && !isDelete {
			filterPortConfigs(filteredPorts, !isDelete)
			for _, rule := range rollbackRules {
				if err := iptable.RawCombinedOutput(rule...); err != nil {
					log.G(context.TODO()).Warnf("roll back rule failed, %v: %v", rule, err)
				}
			}
		}
	}()

	for _, iPort := range filteredPorts {
		var (
			protocol      = strings.ToLower(PortConfig_Protocol_name[int32(iPort.Protocol)])
			publishedPort = strconv.FormatUint(uint64(iPort.PublishedPort), 10)
			destination   = net.JoinHostPort(gwIP.String(), publishedPort)
		)
		if iptable.ExistChain(ingressChain, iptables.Nat) {
			rule := []string{"-t", "nat", addDelOpt, ingressChain, "-p", protocol, "--dport", publishedPort, "-j", "DNAT", "--to-destination", destination}
			if portErr = iptable.RawCombinedOutput(rule...); portErr != nil {
				err := fmt.Errorf("set up rule failed, %v: %v", rule, portErr)
				if !isDelete {
					return err
				}
				log.G(context.TODO()).Info(err)
			}
			rollbackRule := []string{"-t", "nat", rollbackAddDelOpt, ingressChain, "-p", protocol, "--dport", publishedPort, "-j", "DNAT", "--to-destination", destination}
			rollbackRules = append(rollbackRules, rollbackRule)
		}

		// Filter table rules to allow a published service to be accessible in the local node from..
		// 1) service tasks attached to other networks
		// 2) unmanaged containers on bridge networks
		rule := []string{addDelOpt, ingressChain, "-p", protocol, "--sport", publishedPort, "-m", "conntrack", "--ctstate", "ESTABLISHED,RELATED", "-j", "ACCEPT"}
		if portErr = iptable.RawCombinedOutput(rule...); portErr != nil {
			err := fmt.Errorf("set up rule failed, %v: %v", rule, portErr)
			if !isDelete {
				return err
			}
			log.G(context.TODO()).Warn(err)
		}
		rollbackRule := []string{rollbackAddDelOpt, ingressChain, "-p", protocol, "--sport", publishedPort, "-m", "conntrack", "--ctstate", "ESTABLISHED,RELATED", "-j", "ACCEPT"}
		rollbackRules = append(rollbackRules, rollbackRule)

		rule = []string{addDelOpt, ingressChain, "-p", protocol, "--dport", publishedPort, "-j", "ACCEPT"}
		if portErr = iptable.RawCombinedOutput(rule...); portErr != nil {
			err := fmt.Errorf("set up rule failed, %v: %v", rule, portErr)
			if !isDelete {
				return err
			}
			log.G(context.TODO()).Warn(err)
		}
		rollbackRule = []string{rollbackAddDelOpt, ingressChain, "-p", protocol, "--dport", publishedPort, "-j", "ACCEPT"}
		rollbackRules = append(rollbackRules, rollbackRule)

		if err := plumbProxy(iPort, isDelete); err != nil {
			log.G(context.TODO()).Warnf("failed to create proxy for port %s: %v", publishedPort, err)
		}
	}

	return nil
}
// findOIFName resolves the name of the interface the kernel would use
// as the output interface to reach ip.
func findOIFName(ip net.IP) (string, error) {
	handle := ns.NlHandle()

	rts, err := handle.RouteGet(ip)
	if err != nil {
		return "", err
	}
	if len(rts) == 0 {
		return "", fmt.Errorf("no route to %s", ip)
	}

	// Use the first route only (typically there is just one); multipath
	// is not supported.
	link, err := handle.LinkByIndex(rts[0].LinkIndex)
	if err != nil {
		return "", err
	}
	return link.Attrs().Name, nil
}
// plumbProxy opens (or, with isDelete, closes) the listener held for a
// published ingress port, keyed by "port/proto" in ingressProxyTbl.
func plumbProxy(iPort *PortConfig, isDelete bool) error {
	portSpec := fmt.Sprintf("%d/%s", iPort.PublishedPort, strings.ToLower(PortConfig_Protocol_name[int32(iPort.Protocol)]))
	if isDelete {
		if listener, ok := ingressProxyTbl[portSpec]; ok {
			if listener != nil {
				listener.Close()
			}
			// Remove the entry: leaving it behind would retain a closed
			// listener forever and Close it again on a subsequent delete.
			delete(ingressProxyTbl, portSpec)
		}
		return nil
	}

	var (
		l   io.Closer
		err error
	)
	switch iPort.Protocol {
	case ProtocolTCP:
		l, err = net.ListenTCP("tcp", &net.TCPAddr{Port: int(iPort.PublishedPort)})
	case ProtocolUDP:
		l, err = net.ListenUDP("udp", &net.UDPAddr{Port: int(iPort.PublishedPort)})
	case ProtocolSCTP:
		l, err = sctp.ListenSCTP("sctp", &sctp.SCTPAddr{Port: int(iPort.PublishedPort)})
	default:
		err = fmt.Errorf("unknown protocol %v", iPort.Protocol)
	}
	if err != nil {
		return err
	}

	ingressProxyTbl[portSpec] = l
	return nil
}
// configureFWMark configures the sandbox firewall to mark vip destined packets
// with the firewall mark fwMark.
//
// With isDelete the mangle rules are removed (-D) instead of appended
// (-A). In NAT load-balancer mode an IPVS SNAT POSTROUTING rule for the
// endpoint's subnet is also ensured on add, and IPVS connection
// tracking is enabled via /proc. All rules are applied from inside the
// sandbox's namespace through sb.ExecFunc.
func (sb *Sandbox) configureFWMark(vip net.IP, fwMark uint32, ingressPorts []*PortConfig, eIP *net.IPNet, isDelete bool, lbMode string) error {
	// TODO IPv6 support
	iptable := iptables.GetIptable(iptables.IPv4)

	fwMarkStr := strconv.FormatUint(uint64(fwMark), 10)
	addDelOpt := "-A"
	if isDelete {
		addDelOpt = "-D"
	}

	// One PREROUTING mangle rule per ingress port, marking packets to
	// the published port with fwMark.
	rules := make([][]string, 0, len(ingressPorts))
	for _, iPort := range ingressPorts {
		var (
			protocol      = strings.ToLower(PortConfig_Protocol_name[int32(iPort.Protocol)])
			publishedPort = strconv.FormatUint(uint64(iPort.PublishedPort), 10)
		)
		rule := []string{"-t", "mangle", addDelOpt, "PREROUTING", "-p", protocol, "--dport", publishedPort, "-j", "MARK", "--set-mark", fwMarkStr}
		rules = append(rules, rule)
	}

	var innerErr error
	err := sb.ExecFunc(func() {
		// NOTE: this closure appends to the captured rules slice before
		// applying the whole batch below.
		if !isDelete && lbMode == loadBalancerModeNAT {
			subnet := net.IPNet{IP: eIP.IP.Mask(eIP.Mask), Mask: eIP.Mask}
			ruleParams := []string{"-m", "ipvs", "--ipvs", "-d", subnet.String(), "-j", "SNAT", "--to-source", eIP.IP.String()}
			if !iptable.Exists("nat", "POSTROUTING", ruleParams...) {
				rule := append([]string{"-t", "nat", "-A", "POSTROUTING"}, ruleParams...)
				rules = append(rules, rule)

				err := os.WriteFile("/proc/sys/net/ipv4/vs/conntrack", []byte{'1', '\n'}, 0o644)
				if err != nil {
					innerErr = err
					return
				}
			}
		}

		// Mark packets addressed directly to the VIP as well.
		rule := []string{"-t", "mangle", addDelOpt, "INPUT", "-d", vip.String() + "/32", "-j", "MARK", "--set-mark", fwMarkStr}
		rules = append(rules, rule)

		for _, rule := range rules {
			if err := iptable.RawCombinedOutputNative(rule...); err != nil {
				innerErr = fmt.Errorf("set up rule failed, %v: %w", rule, err)
				return
			}
		}
	})
	if err != nil {
		return err
	}

	return innerErr
}
// addRedirectRules programs, inside the sandbox, a nat PREROUTING
// redirect from each published port to its target port on eIP, the
// matching conntrack-gated INPUT/OUTPUT ACCEPT rules, and — whenever
// any ingress port exists — catch-all DROP rules for other tcp/udp/sctp
// traffic to or from eIP.
func (sb *Sandbox) addRedirectRules(eIP *net.IPNet, ingressPorts []*PortConfig) error {
	// TODO IPv6 support
	iptable := iptables.GetIptable(iptables.IPv4)
	ipAddr := eIP.IP.String()

	rules := make([][]string, 0, len(ingressPorts)*3) // 3 rules per port
	for _, iPort := range ingressPorts {
		var (
			protocol      = strings.ToLower(PortConfig_Protocol_name[int32(iPort.Protocol)])
			publishedPort = strconv.FormatUint(uint64(iPort.PublishedPort), 10)
			targetPort    = strconv.FormatUint(uint64(iPort.TargetPort), 10)
		)

		rules = append(rules,
			[]string{"-t", "nat", "-A", "PREROUTING", "-d", ipAddr, "-p", protocol, "--dport", publishedPort, "-j", "REDIRECT", "--to-port", targetPort},
			// Allow only incoming connections to exposed ports
			[]string{"-I", "INPUT", "-d", ipAddr, "-p", protocol, "--dport", targetPort, "-m", "conntrack", "--ctstate", "NEW,ESTABLISHED", "-j", "ACCEPT"},
			// Allow only outgoing connections from exposed ports
			[]string{"-I", "OUTPUT", "-s", ipAddr, "-p", protocol, "--sport", targetPort, "-m", "conntrack", "--ctstate", "ESTABLISHED", "-j", "ACCEPT"},
		)
	}

	var innerErr error
	err := sb.ExecFunc(func() {
		for _, rule := range rules {
			if err := iptable.RawCombinedOutputNative(rule...); err != nil {
				innerErr = fmt.Errorf("set up rule failed, %v: %w", rule, err)
				return
			}
		}

		if len(ingressPorts) == 0 {
			return
		}

		// Ensure blocking rules for anything else in/to ingress network
		for _, rule := range [][]string{
			{"-d", ipAddr, "-p", "sctp", "-j", "DROP"},
			{"-d", ipAddr, "-p", "udp", "-j", "DROP"},
			{"-d", ipAddr, "-p", "tcp", "-j", "DROP"},
		} {
			if !iptable.ExistsNative(iptables.Filter, "INPUT", rule...) {
				if err := iptable.RawCombinedOutputNative(append([]string{"-A", "INPUT"}, rule...)...); err != nil {
					innerErr = fmt.Errorf("set up rule failed, %v: %w", rule, err)
					return
				}
			}
			// Reuse the same rule slice for the OUTPUT direction by
			// swapping the destination match ("-d") for a source match.
			rule[0] = "-s"
			if !iptable.ExistsNative(iptables.Filter, "OUTPUT", rule...) {
				if err := iptable.RawCombinedOutputNative(append([]string{"-A", "OUTPUT"}, rule...)...); err != nil {
					innerErr = fmt.Errorf("set up rule failed, %v: %w", rule, err)
					return
				}
			}
		}
	})
	if err != nil {
		return err
	}

	return innerErr
}
package libnetwork
import (
"context"
"errors"
"fmt"
"github.com/containerd/log"
"github.com/docker/docker/daemon/libnetwork/datastore"
"github.com/docker/docker/daemon/libnetwork/scope"
"go.opentelemetry.io/otel"
)
// getNetworkFromStore returns the stored network whose id matches nid,
// or ErrNoSuchNetwork when no such network exists.
func (c *Controller) getNetworkFromStore(nid string) (*Network, error) {
	for _, nw := range c.getNetworksFromStore(context.TODO()) {
		if nw.id == nid {
			return nw, nil
		}
	}
	return nil, ErrNoSuchNetwork(nid)
}
// getNetworks lists all networks from the store, fixing up each one's
// controller reference, caching it, and defaulting its scope to local.
func (c *Controller) getNetworks() ([]*Network, error) {
	kvol, err := c.store.List(&Network{ctrlr: c})
	if err != nil && !errors.Is(err, datastore.ErrKeyNotFound) {
		return nil, fmt.Errorf("failed to get networks: %w", err)
	}

	var nl []*Network
	for _, kvo := range kvol {
		nw := kvo.(*Network)
		nw.ctrlr = c
		c.cacheNetwork(nw)
		if nw.scope == "" {
			nw.scope = scope.Local
		}
		nl = append(nl, nw)
	}
	return nl, nil
}
// getNetworksFromStore lists all stored networks, fixing up each one's
// controller reference and defaulting its scope to local. Errors are
// logged (except a missing key) and yield a nil slice.
func (c *Controller) getNetworksFromStore(ctx context.Context) []*Network { // FIXME: unify with c.getNetworks()
	kvol, err := c.store.List(&Network{ctrlr: c})
	if err != nil {
		if !errors.Is(err, datastore.ErrKeyNotFound) {
			log.G(ctx).Debugf("failed to get networks from store: %v", err)
		}
		return nil
	}

	var nl []*Network
	for _, kvo := range kvol {
		nw := kvo.(*Network)
		nw.mu.Lock()
		nw.ctrlr = c
		if nw.scope == "" {
			nw.scope = scope.Local
		}
		nw.mu.Unlock()
		nl = append(nl, nw)
	}
	return nl
}
// getEndpointFromStore loads endpoint eid of network n from the store
// and caches it on the controller.
func (n *Network) getEndpointFromStore(eid string) (*Endpoint, error) {
	ep := &Endpoint{id: eid, network: n}
	if err := n.ctrlr.store.GetObject(ep); err != nil {
		return nil, fmt.Errorf("could not find endpoint %s: %w", eid, err)
	}
	n.ctrlr.cacheEndpoint(ep)
	return ep, nil
}
// getEndpointsFromStore lists the endpoints of network n from the
// store, caching each one. A missing key yields an empty result rather
// than an error.
func (n *Network) getEndpointsFromStore() ([]*Endpoint, error) {
	kvol, err := n.getController().store.List(&Endpoint{network: n})
	if err != nil {
		if errors.Is(err, datastore.ErrKeyNotFound) {
			return nil, nil
		}
		return nil, fmt.Errorf("failed to get endpoints for network %s: %w",
			n.Name(), err)
	}

	var epl []*Endpoint
	for _, kvo := range kvol {
		ep := kvo.(*Endpoint)
		n.ctrlr.cacheEndpoint(ep)
		epl = append(epl, ep)
	}
	return epl, nil
}
// updateToStore atomically persists kvObject. A concurrent-modification
// failure is returned as datastore.ErrKeyModified so callers can retry;
// any other failure is wrapped with the object type.
func (c *Controller) updateToStore(ctx context.Context, kvObject datastore.KVObject) error {
	ctx, span := otel.Tracer("").Start(ctx, "libnetwork.Controller.updateToStore")
	defer span.End()

	err := c.store.PutObjectAtomic(kvObject)
	if err == nil {
		return nil
	}
	if errors.Is(err, datastore.ErrKeyModified) {
		return err
	}
	return fmt.Errorf("failed to update store for object type %T: %v", kvObject, err)
}
// deleteFromStore atomically removes kvObject from the store. When the
// stored copy was modified concurrently, the object is refreshed and
// the delete retried.
func (c *Controller) deleteFromStore(kvObject datastore.KVObject) error {
	for {
		err := c.store.DeleteObjectAtomic(kvObject)
		if err == nil {
			return nil
		}
		if !errors.Is(err, datastore.ErrKeyModified) {
			return err
		}
		if err := c.store.GetObject(kvObject); err != nil {
			return fmt.Errorf("could not update the kvobject to latest when trying to delete: %v", err)
		}
		log.G(context.TODO()).Warnf("Error (%v) deleting object %v, retrying....", err, kvObject.Key())
	}
}
// networkCleanup removes networks that were left mid-deletion
// (inDelete) by a previous daemon run.
func (c *Controller) networkCleanup() {
	for _, nw := range c.getNetworksFromStore(context.TODO()) {
		if !nw.inDelete {
			continue
		}
		log.G(context.TODO()).Infof("Removing stale network %s (%s)", nw.Name(), nw.ID())
		if err := nw.delete(true, true); err != nil {
			log.G(context.TODO()).Debugf("Error while removing stale network: %v", err)
		}
	}
}
// Package types contains types that are common across libnetwork project
package types
import (
"bytes"
"fmt"
"net"
"strconv"
"strings"
"github.com/docker/docker/errdefs"
"github.com/ishidawataru/sctp"
)
// constants for the IP address type
//
// Deprecated: use the consts defined in github.com/docker/docker/libnetwork/resolvconf
const (
	IP = iota // IPv4 and IPv6
	IPv4
	IPv6
)

// EncryptionKey is the libnetwork representation of the key distributed by the lead
// manager.
type EncryptionKey struct {
	Subsystem   string
	Algorithm   int32
	Key         []byte
	LamportTime uint64
}

// QosPolicy represents a quality of service policy on an endpoint.
type QosPolicy struct {
	MaxEgressBandwidth uint64
}

// TransportPort represents a local Layer 4 endpoint: a transport
// protocol together with a port number.
type TransportPort struct {
	Proto Protocol
	Port  uint16
}
// Equal checks if this instance of TransportPort is equal to the passed one
func (t *TransportPort) Equal(o *TransportPort) bool {
	if t == o {
		return true
	}
	return o != nil && t.Proto == o.Proto && t.Port == o.Port
}
// GetCopy returns a copy of this TransportPort structure instance
func (t *TransportPort) GetCopy() TransportPort {
	// TransportPort holds only value fields, so a shallow copy suffices.
	return *t
}
// String returns the TransportPort structure in string form
func (t *TransportPort) String() string {
	return t.Proto.String() + "/" + strconv.Itoa(int(t.Port))
}
// PortBinding represents a port binding between the container and the host
type PortBinding struct {
	Proto       Protocol // IP protocol number
	IP          net.IP   // container-side address
	Port        uint16   // container-side port
	HostIP      net.IP   // host-side address
	HostPort    uint16   // host-side port (start of range when HostPortEnd is set)
	HostPortEnd uint16   // inclusive end of the host port range, if any
}
// HostAddr returns the host side transport address
func (p PortBinding) HostAddr() (net.Addr, error) {
	switch p.Proto {
	case TCP:
		return &net.TCPAddr{IP: p.HostIP, Port: int(p.HostPort)}, nil
	case UDP:
		return &net.UDPAddr{IP: p.HostIP, Port: int(p.HostPort)}, nil
	case SCTP:
		return &sctp.SCTPAddr{IPAddrs: []net.IPAddr{{IP: p.HostIP}}, Port: int(p.HostPort)}, nil
	default:
		return nil, fmt.Errorf("invalid transport protocol: %s", p.Proto.String())
	}
}
// ContainerAddr returns the container side transport address
func (p PortBinding) ContainerAddr() (net.Addr, error) {
	switch p.Proto {
	case TCP:
		return &net.TCPAddr{IP: p.IP, Port: int(p.Port)}, nil
	case UDP:
		return &net.UDPAddr{IP: p.IP, Port: int(p.Port)}, nil
	case SCTP:
		return &sctp.SCTPAddr{IPAddrs: []net.IPAddr{{IP: p.IP}}, Port: int(p.Port)}, nil
	default:
		return nil, fmt.Errorf("invalid transport protocol: %s", p.Proto.String())
	}
}
// GetCopy returns a copy of this PortBinding structure instance.
// The IP slices are deep-copied so the copy does not alias the original.
func (p *PortBinding) GetCopy() PortBinding {
	cp := *p
	cp.IP = GetIPCopy(p.IP)
	cp.HostIP = GetIPCopy(p.HostIP)
	return cp
}
// Equal returns true if o has the same values as p, else false.
//
// Mirroring TransportPort.Equal, identical pointers compare equal and a
// nil argument compares unequal instead of causing a nil dereference.
func (p *PortBinding) Equal(o *PortBinding) bool {
	if p == o {
		return true
	}
	if o == nil {
		return false
	}
	return p.Proto == o.Proto &&
		p.IP.Equal(o.IP) &&
		p.Port == o.Port &&
		p.HostIP.Equal(o.HostIP) &&
		p.HostPort == o.HostPort &&
		p.HostPortEnd == o.HostPortEnd
}
// String returns the PortBinding structure in the form "HostIP:HostPort:IP:Port/Proto",
// omitting un-set fields apart from Port.
func (p PortBinding) String() string {
	var ret strings.Builder
	// Host address, bracketed when IPv6 ("[::1]:").
	if len(p.HostIP) > 0 {
		is6 := p.HostIP.To4() == nil
		if is6 {
			ret.WriteRune('[')
		}
		ret.WriteString(p.HostIP.String())
		if is6 {
			ret.WriteRune(']')
		}
		ret.WriteRune(':')
	}
	// Host port, optionally written as a "start-end" range.
	if p.HostPort != 0 {
		ret.WriteString(strconv.Itoa(int(p.HostPort)))
		if p.HostPortEnd != 0 && p.HostPortEnd != p.HostPort {
			ret.WriteRune('-')
			ret.WriteString(strconv.Itoa(int(p.HostPortEnd)))
		}
	}
	// Separate the host part from the container part whenever anything
	// was written for the host side.
	if ret.Len() > 0 {
		ret.WriteRune(':')
	}
	// Container address, bracketed when IPv6.
	if len(p.IP) > 0 {
		is6 := p.IP.To4() == nil
		if is6 {
			ret.WriteRune('[')
		}
		ret.WriteString(p.IP.String())
		if is6 {
			ret.WriteRune(']')
		}
		ret.WriteRune(':')
	}
	// Port is always written, even when zero.
	ret.WriteString(strconv.Itoa(int(p.Port)))
	if p.Proto != 0 {
		ret.WriteRune('/')
		ret.WriteString(p.Proto.String())
	}
	return ret.String()
}
const (
	// ICMP is for the ICMP ip protocol
	ICMP = 1
	// TCP is for the TCP ip protocol
	TCP = 6
	// UDP is for the UDP ip protocol
	UDP = 17
	// SCTP is for the SCTP ip protocol
	SCTP = 132
)

// Protocol represents an IP protocol number
type Protocol uint8

// String returns the conventional lowercase name for the protocol
// number, falling back to the decimal number itself for protocols
// without a well-known name.
func (p Protocol) String() string {
	switch p {
	case ICMP:
		return "icmp"
	case SCTP:
		return "sctp"
	case TCP:
		return "tcp"
	case UDP:
		return "udp"
	default:
		return strconv.Itoa(int(p))
	}
}
// ParseProtocol returns the respective Protocol type for the passed string
func ParseProtocol(s string) Protocol {
	switch strings.ToLower(s) {
	case "tcp":
		return TCP
	case "udp":
		return UDP
	case "icmp":
		return ICMP
	case "sctp":
		return SCTP
	}
	// Unrecognized protocol names map to 0.
	return 0
}
// GetMacCopy returns a copy of the passed MAC address
func GetMacCopy(from net.HardwareAddr) net.HardwareAddr {
if from == nil {
return nil
}
to := make(net.HardwareAddr, len(from))
copy(to, from)
return to
}
// GetIPCopy returns a copy of the passed IP address
func GetIPCopy(from net.IP) net.IP {
if from == nil {
return nil
}
to := make(net.IP, len(from))
copy(to, from)
return to
}
// GetIPNetCopy returns a copy of the passed IP Network
func GetIPNetCopy(from *net.IPNet) *net.IPNet {
if from == nil {
return nil
}
bm := make(net.IPMask, len(from.Mask))
copy(bm, from.Mask)
return &net.IPNet{IP: GetIPCopy(from.IP), Mask: bm}
}
// GetIPNetCanonical returns the canonical form for the passed network:
// a copy of nw whose address has its host bits zeroed by the mask.
// The input is never modified; a nil input yields nil.
func GetIPNetCanonical(nw *net.IPNet) *net.IPNet {
	if nw == nil {
		return nil
	}
	// Copy first so that masking does not touch the caller's value.
	c := GetIPNetCopy(nw)
	c.IP = c.IP.Mask(nw.Mask)
	return c
}
// CompareIPNet returns equal if the two IP Networks are equal
func CompareIPNet(a, b *net.IPNet) bool {
if a == b {
return true
}
if a == nil || b == nil {
return false
}
return a.IP.Equal(b.IP) && bytes.Equal(a.Mask, b.Mask)
}
// IsIPNetValid returns true if the ipnet is a valid network/mask
// combination. Otherwise returns false.
//
// Only the IPv4 catch-all "0.0.0.0/0" is rejected; any other value,
// including the IPv6 default route, is considered valid.
func IsIPNetValid(nw *net.IPNet) bool {
	return nw.String() != "0.0.0.0/0"
}
var v4inV6MaskPrefix = []byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}
// compareIPMask checks if the passed ip and mask are semantically compatible.
// It returns the byte indexes for the address and mask so that caller can
// do bitwise operations without modifying address representation.
func compareIPMask(ip net.IP, mask net.IPMask) (is int, ms int, _ error) {
// Find the effective starting of address and mask
if len(ip) == net.IPv6len && ip.To4() != nil {
is = 12
}
if len(ip[is:]) == net.IPv4len && len(mask) == net.IPv6len && bytes.Equal(mask[:12], v4inV6MaskPrefix) {
ms = 12
}
// Check if address and mask are semantically compatible
if len(ip[is:]) != len(mask[ms:]) {
return 0, 0, fmt.Errorf("ip and mask are not compatible: (%s, %s)", ip, mask)
}
return is, ms, nil
}
// GetHostPartIP returns the host portion of the ip address identified by the mask.
// IP address representation is not modified. If address and mask are not compatible
// an error is returned.
func GetHostPartIP(ip net.IP, mask net.IPMask) (net.IP, error) {
	is, ms, err := compareIPMask(ip, mask)
	if err != nil {
		return nil, fmt.Errorf("cannot compute host portion ip address because %s", err)
	}

	// Clear every network bit, leaving only the host part.
	out := GetIPCopy(ip)
	for i := range mask[ms:] {
		out[is+i] &= ^mask[ms+i]
	}
	return out, nil
}
// GetBroadcastIP returns the broadcast ip address for the passed network (ip and mask).
// IP address representation is not modified. If address and mask are not compatible
// an error is returned.
func GetBroadcastIP(ip net.IP, mask net.IPMask) (net.IP, error) {
	is, ms, err := compareIPMask(ip, mask)
	if err != nil {
		return nil, fmt.Errorf("cannot compute broadcast ip address because %s", err)
	}

	// Set every host bit to one.
	out := GetIPCopy(ip)
	for i := range mask[ms:] {
		out[is+i] |= ^mask[ms+i]
	}
	return out, nil
}
// ParseCIDR returns the *net.IPNet represented by the passed CIDR notation.
// Unlike net.ParseCIDR, the returned network keeps the exact address that
// was parsed instead of masking it down to the network base address.
func ParseCIDR(cidr string) (*net.IPNet, error) {
	ip, nw, err := net.ParseCIDR(cidr)
	if err != nil {
		return nil, err
	}
	nw.IP = ip
	return nw, nil
}
const (
	// NEXTHOP indicates a StaticRoute with an IP next hop.
	NEXTHOP = iota
	// CONNECTED indicates a StaticRoute with an interface for directly connected peers.
	CONNECTED
)
// StaticRoute is a statically-provisioned IP route.
type StaticRoute struct {
	Destination *net.IPNet
	RouteType   int // NEXT_HOP or CONNECTED
	// NextHop will be resolved by the kernel (i.e. as a loose hop).
	NextHop net.IP
}
// GetCopy returns a copy of this StaticRoute structure.
// Destination and NextHop are deep-copied; RouteType is a plain value.
func (r *StaticRoute) GetCopy() *StaticRoute {
	return &StaticRoute{
		Destination: GetIPNetCopy(r.Destination),
		RouteType:   r.RouteType,
		NextHop:     GetIPCopy(r.NextHop),
	}
}
// InterfaceStatistics represents the interface's statistics
type InterfaceStatistics struct {
	RxBytes   uint64
	RxPackets uint64
	RxErrors  uint64
	RxDropped uint64
	TxBytes   uint64
	TxPackets uint64
	TxErrors  uint64
	TxDropped uint64
}

// String renders all counters on one line for logging/debugging.
func (s *InterfaceStatistics) String() string {
	return fmt.Sprintf("\nRxBytes: %d, RxPackets: %d, RxErrors: %d, RxDropped: %d, TxBytes: %d, TxPackets: %d, TxErrors: %d, TxDropped: %d",
		s.RxBytes, s.RxPackets, s.RxErrors, s.RxDropped, s.TxBytes, s.TxPackets, s.TxErrors, s.TxDropped)
}
/******************************
 * Well-known Error Interfaces
 ******************************/

// MaskableError is an interface for errors which can be ignored by caller
type MaskableError interface {
	// Maskable makes implementer into MaskableError type
	Maskable()
}

// InvalidParameterError is an interface for errors originated by a bad request
type InvalidParameterError = errdefs.ErrInvalidParameter

// NotFoundError is an interface for errors raised because a needed resource is not available
type NotFoundError = errdefs.ErrNotFound

// ForbiddenError is an interface for errors which denote a valid request that cannot be honored
type ForbiddenError = errdefs.ErrForbidden

// UnavailableError is an interface for errors returned when the required service is not available
type UnavailableError = errdefs.ErrUnavailable

// NotImplementedError is an interface for errors raised because of requested functionality is not yet implemented
type NotImplementedError = errdefs.ErrNotImplemented

// InternalError is an interface for errors raised because of an internal error
type InternalError interface {
	// Internal makes implementer into InternalError type
	Internal()
}
/******************************
 * Well-known Error Formatters
 ******************************/

// InvalidParameterErrorf creates an instance of InvalidParameterError
func InvalidParameterErrorf(format string, params ...interface{}) error {
	return errdefs.InvalidParameter(fmt.Errorf(format, params...))
}

// NotFoundErrorf creates an instance of NotFoundError
func NotFoundErrorf(format string, params ...interface{}) error {
	return errdefs.NotFound(fmt.Errorf(format, params...))
}

// ForbiddenErrorf creates an instance of ForbiddenError
func ForbiddenErrorf(format string, params ...interface{}) error {
	return errdefs.Forbidden(fmt.Errorf(format, params...))
}

// UnavailableErrorf creates an instance of UnavailableError
func UnavailableErrorf(format string, params ...interface{}) error {
	return errdefs.Unavailable(fmt.Errorf(format, params...))
}

// NotImplementedErrorf creates an instance of NotImplementedError
func NotImplementedErrorf(format string, params ...interface{}) error {
	return errdefs.NotImplemented(fmt.Errorf(format, params...))
}
// InternalErrorf creates an instance of InternalError
func InternalErrorf(format string, params ...interface{}) error {
	return internal(fmt.Sprintf(format, params...))
}

// InternalMaskableErrorf creates an instance of InternalError and MaskableError
func InternalMaskableErrorf(format string, params ...interface{}) error {
	return maskInternal(fmt.Sprintf(format, params...))
}

/***********************
 * Internal Error Types
 ***********************/

// internal is a string-backed error carrying the InternalError marker.
type internal string

func (nt internal) Error() string { return string(nt) }

// Internal marks internal as an InternalError.
func (nt internal) Internal() {}

// maskInternal is a string-backed error carrying both the InternalError
// and MaskableError markers.
type maskInternal string

func (mnt maskInternal) Error() string { return string(mnt) }

// Internal marks maskInternal as an InternalError.
func (mnt maskInternal) Internal() {}

// Maskable marks maskInternal as a MaskableError.
func (mnt maskInternal) Maskable() {}
package logger
import (
"context"
"io"
"os"
"path/filepath"
"sync"
"time"
"github.com/containerd/log"
"github.com/docker/docker/pkg/plugingetter"
"github.com/moby/moby/api/types/plugins/logdriver"
"github.com/pkg/errors"
)
// pluginAdapter takes a plugin and implements the Logger interface for logger
// instances
type pluginAdapter struct {
	driverName   string    // name the plugin was registered under
	id           string    // identifier for this logging stream
	plugin       logPlugin // the wrapped plugin client
	fifoPath     string    // on-disk path of the fifo the plugin reads from
	capabilities Capability
	logInfo      Info

	// synchronize access to the log stream and shared buffer
	mu     sync.Mutex
	enc    logdriver.LogEntryEncoder
	stream io.WriteCloser
	// buf is shared for each `Log()` call to reduce allocations.
	// buf must be protected by mutex
	buf logdriver.LogEntry
}
// Log encodes msg onto the plugin's log stream.
//
// The shared a.buf is reused across calls (guarded by a.mu) to avoid
// allocating a LogEntry per message. msg is recycled into the message
// pool after the lock is released, regardless of the encode outcome.
func (a *pluginAdapter) Log(msg *Message) error {
	a.mu.Lock()

	a.buf.Line = msg.Line
	a.buf.TimeNano = msg.Timestamp.UnixNano()
	a.buf.Partial = msg.PLogMetaData != nil
	a.buf.Source = msg.Source
	if msg.PLogMetaData != nil {
		a.buf.PartialLogMetadata = &logdriver.PartialLogEntryMetadata{
			Id:      msg.PLogMetaData.ID,
			Last:    msg.PLogMetaData.Last,
			Ordinal: int32(msg.PLogMetaData.Ordinal),
		}
	}

	err := a.enc.Encode(&a.buf)
	// Reset so the next call starts from a clean entry.
	a.buf.Reset()

	a.mu.Unlock()
	PutMessage(msg)
	return err
}
// Name returns the name of the log driver plugin this adapter wraps.
func (a *pluginAdapter) Name() string {
	return a.driverName
}
// Close tells the plugin to stop logging this stream, then tears down
// the local fifo and releases the plugin reference.
//
// Stream-close and fifo-removal failures are logged rather than
// returned: only a StopLogging failure aborts the teardown.
func (a *pluginAdapter) Close() error {
	a.mu.Lock()
	defer a.mu.Unlock()

	if err := a.plugin.StopLogging(filepath.Join("/", "run", "docker", "logging", a.id)); err != nil {
		return err
	}

	if err := a.stream.Close(); err != nil {
		log.G(context.TODO()).WithError(err).Error("error closing plugin fifo")
	}
	if err := os.Remove(a.fifoPath); err != nil && !os.IsNotExist(err) {
		log.G(context.TODO()).WithError(err).Error("error cleaning up plugin fifo")
	}

	// may be nil, especially for unit tests
	if pluginGetter != nil {
		pluginGetter.Get(a.Name(), extName, plugingetter.Release)
	}
	return nil
}
// pluginAdapterWithRead is a pluginAdapter for plugins that also support
// reading logs back.
type pluginAdapterWithRead struct {
	*pluginAdapter
}

// ReadLogs streams log messages decoded from the plugin's reader into
// the returned LogWatcher. The feeding goroutine exits — closing
// watcher.Msg — on ctx cancellation, EOF, decode error, a message past
// config.Until, or when the consumer goes away.
func (a *pluginAdapterWithRead) ReadLogs(ctx context.Context, config ReadConfig) *LogWatcher {
	watcher := NewLogWatcher()

	go func() {
		// Closing Msg signals the consumer that no more messages follow.
		defer close(watcher.Msg)

		stream, err := a.plugin.ReadLogs(a.logInfo, config)
		if err != nil {
			watcher.Err <- errors.Wrap(err, "error getting log reader")
			return
		}
		defer stream.Close()

		dec := logdriver.NewLogEntryDecoder(stream)
		for {
			if ctx.Err() != nil {
				return
			}

			var buf logdriver.LogEntry
			if err := dec.Decode(&buf); err != nil {
				if errors.Is(err, io.EOF) {
					return
				}
				watcher.Err <- errors.Wrap(err, "error decoding log message")
				return
			}

			msg := &Message{
				Timestamp: time.Unix(0, buf.TimeNano),
				Line:      buf.Line,
				Source:    buf.Source,
			}

			// plugin should handle this, but check just in case
			if !config.Since.IsZero() && msg.Timestamp.Before(config.Since) {
				continue
			}
			if !config.Until.IsZero() && msg.Timestamp.After(config.Until) {
				return
			}

			// send the message unless the consumer is gone
			select {
			case watcher.Msg <- msg:
			case <-ctx.Done():
				return
			case <-watcher.WatchConsumerGone():
				return
			}
		}
	}()

	return watcher
}
package logger
import (
"bytes"
"context"
"io"
"sync"
"time"
"github.com/containerd/log"
"github.com/docker/docker/pkg/stringid"
types "github.com/moby/moby/api/types/backend"
)
const (
	// readSize is the maximum bytes read during a single read
	// operation.
	readSize = 2 * 1024

	// defaultBufSize provides a reasonable default for loggers that do
	// not have an external limit to impose on log line size.
	defaultBufSize = 16 * 1024
)
// Copier can copy logs from specified sources to Logger and attach Timestamp.
// Writes are concurrent, so you need implement some sync in your logger.
type Copier struct {
	// srcs is map of name -> reader pairs, for example "stdout", "stderr"
	srcs     map[string]io.Reader
	dst      Logger         // destination all sources are copied to
	copyJobs sync.WaitGroup // tracks the per-source copy goroutines
	closeOnce sync.Once     // guards closing of the closed channel
	closed    chan struct{} // closed by Close to stop the copy goroutines
}
// NewCopier creates a new Copier copying each reader in srcs to dst.
// Call Run to start the copy goroutines.
func NewCopier(srcs map[string]io.Reader, dst Logger) *Copier {
	return &Copier{
		closed: make(chan struct{}),
		srcs:   srcs,
		dst:    dst,
	}
}
// Run starts one copy goroutine per source stream.
func (c *Copier) Run() {
	for name, r := range c.srcs {
		c.copyJobs.Add(1)
		go c.copySrc(name, r)
	}
}
// copySrc reads from src until EOF or Close, logging each newline-terminated
// line to c.dst under the given source name. When a line exceeds the buffer
// it is emitted as a sequence of partial messages that share one ID,
// timestamp and increasing ordinal, with the final piece marked Last.
func (c *Copier) copySrc(name string, src io.Reader) {
	defer c.copyJobs.Done()

	bufSize := defaultBufSize
	if sizedLogger, ok := c.dst.(SizedLogger); ok {
		size := sizedLogger.BufSize()
		// Loggers that wrap another loggers would have BufSize(), but cannot return the size
		// when the wrapped loggers doesn't have BufSize().
		if size > 0 {
			bufSize = size
		}
	}
	buf := make([]byte, bufSize)

	n := 0       // count of buffered bytes not yet logged
	eof := false // src has reached EOF
	var partialid string    // ID shared by all pieces of the in-flight partial line
	var partialTS time.Time // timestamp shared by all pieces of the in-flight partial line
	var ordinal int         // 1-based index of the next partial piece
	firstPartial := true
	hasMorePartial := false

	for {
		select {
		case <-c.closed:
			return
		default:
			// Work out how much more data we are okay with reading this time.
			upto := n + readSize
			if upto > cap(buf) {
				upto = cap(buf)
			}
			// Try to read that data.
			if upto > n {
				read, err := src.Read(buf[n:upto])
				if err != nil {
					if err != io.EOF {
						logReadsFailedCount.Inc(1)
						log.G(context.TODO()).Errorf("Error scanning log stream: %s", err)
						return
					}
					eof = true
				}
				n += read
			}
			// If we have no data to log, and there's no more coming, we're done.
			if n == 0 && eof {
				return
			}
			// Break up the data that we've buffered up into lines, and log each in turn.
			p := 0
			for q := bytes.IndexByte(buf[p:n], '\n'); q >= 0; q = bytes.IndexByte(buf[p:n], '\n') {
				select {
				case <-c.closed:
					return
				default:
					msg := NewMessage()
					msg.Source = name
					msg.Line = append(msg.Line, buf[p:p+q]...)

					if hasMorePartial {
						// This newline completes a line that was previously
						// emitted in partial pieces; mark it as the last piece.
						msg.PLogMetaData = &types.PartialLogMetaData{ID: partialid, Ordinal: ordinal, Last: true}

						// reset
						partialid = ""
						ordinal = 0
						firstPartial = true
						hasMorePartial = false
					}
					if msg.PLogMetaData == nil {
						msg.Timestamp = time.Now().UTC()
					} else {
						msg.Timestamp = partialTS
					}

					if logErr := c.dst.Log(msg); logErr != nil {
						logDriverError(c.dst.Name(), string(msg.Line), logErr)
					}
				}
				p += q + 1
			}
			// If there's no more coming, or the buffer is full but
			// has no newlines, log whatever we haven't logged yet,
			// noting that it's a partial log line.
			if eof || (p == 0 && n == len(buf)) {
				if p < n {
					msg := NewMessage()
					msg.Source = name
					msg.Line = append(msg.Line, buf[p:n]...)

					// Generate unique partialID for first partial. Use it across partials.
					// Record timestamp for first partial. Use it across partials.
					// Initialize Ordinal for first partial. Increment it across partials.
					if firstPartial {
						msg.Timestamp = time.Now().UTC()
						partialTS = msg.Timestamp
						partialid = stringid.GenerateRandomID()
						ordinal = 1
						firstPartial = false
						totalPartialLogs.Inc(1)
					} else {
						msg.Timestamp = partialTS
					}
					msg.PLogMetaData = &types.PartialLogMetaData{ID: partialid, Ordinal: ordinal, Last: false}
					ordinal++
					hasMorePartial = true

					if logErr := c.dst.Log(msg); logErr != nil {
						logDriverError(c.dst.Name(), string(msg.Line), logErr)
					}
					p = 0
					n = 0
				}
				if eof {
					return
				}
			}
			// Move any unlogged data to the front of the buffer in preparation for another read.
			if p > 0 {
				copy(buf[0:], buf[p:n])
				n -= p
			}
		}
	}
}
// Wait waits until all copying is done
func (c *Copier) Wait() {
	c.copyJobs.Wait()
}

// Close closes the copier, signalling every copy goroutine to stop.
// Safe to call multiple times.
func (c *Copier) Close() {
	c.closeOnce.Do(func() {
		close(c.closed)
	})
}
package logger
import (
"fmt"
"sort"
"sync"
"github.com/docker/docker/pkg/plugingetter"
"github.com/docker/go-units"
containertypes "github.com/moby/moby/api/types/container"
"github.com/pkg/errors"
)
// Creator builds a logging driver instance with given context.
type Creator func(Info) (Logger, error)

// LogOptValidator checks the options specific to the underlying
// logging implementation.
type LogOptValidator func(cfg map[string]string) error

// logdriverFactory is a mutex-guarded registry of log driver creators
// and their per-driver option validators.
type logdriverFactory struct {
	registry     map[string]Creator
	optValidator map[string]LogOptValidator
	m            sync.Mutex // guards both maps
}
// list returns the names of all registered log drivers, sorted
// alphabetically.
func (lf *logdriverFactory) list() []string {
	lf.m.Lock()
	// Size the slice while holding the lock: reading len(lf.registry)
	// outside the critical section races with concurrent register() calls.
	ls := make([]string, 0, len(lf.registry))
	for name := range lf.registry {
		ls = append(ls, name)
	}
	lf.m.Unlock()
	sort.Strings(ls)
	return ls
}
// ListDrivers gets the list of registered log driver names
func ListDrivers() []string {
	return factory.list()
}
// register adds c under name, failing when a driver (built-in or
// plugin-provided) with that name already exists.
func (lf *logdriverFactory) register(name string, c Creator) error {
	registered, err := lf.driverRegistered(name)
	if err != nil {
		return err
	}
	if registered {
		return fmt.Errorf("logger: log driver named '%s' is already registered", name)
	}

	lf.m.Lock()
	defer lf.m.Unlock()
	lf.registry[name] = c
	return nil
}
// driverRegistered reports whether name is known, either as a built-in
// driver in the registry or as a discoverable plugin.
func (lf *logdriverFactory) driverRegistered(name string) (bool, error) {
	lf.m.Lock()
	_, ok := lf.registry[name]
	lf.m.Unlock()
	if ok {
		return true, nil
	}
	// this can be nil when the init functions are running
	if pluginGetter == nil {
		return false, nil
	}
	l, err := getPlugin(name, plugingetter.Lookup)
	if err != nil {
		return false, err
	}
	return l != nil, nil
}
// registerLogOptValidator adds the option validator for name; a second
// registration under the same name is an error.
func (lf *logdriverFactory) registerLogOptValidator(name string, l LogOptValidator) error {
	lf.m.Lock()
	defer lf.m.Unlock()

	if _, exists := lf.optValidator[name]; exists {
		return fmt.Errorf("logger: log validator named '%s' is already registered", name)
	}
	lf.optValidator[name] = l
	return nil
}
// get returns the Creator registered under name, falling back to
// acquiring a plugin of that name.
func (lf *logdriverFactory) get(name string) (Creator, error) {
	lf.m.Lock()
	defer lf.m.Unlock()

	if c, ok := lf.registry[name]; ok {
		return c, nil
	}
	// Wrapf returns nil when err is nil, so a successful plugin lookup
	// passes through unwrapped.
	c, err := getPlugin(name, plugingetter.Acquire)
	return c, errors.Wrapf(err, "logger: no log driver named '%s' is registered", name)
}
// getLogOptValidator returns the validator registered for name, or nil
// when the driver has none.
func (lf *logdriverFactory) getLogOptValidator(name string) LogOptValidator {
	lf.m.Lock()
	defer lf.m.Unlock()
	return lf.optValidator[name]
}
// factory is the process-wide registry used by the package-level
// Register*/Get* helpers below.
var factory = &logdriverFactory{registry: make(map[string]Creator), optValidator: make(map[string]LogOptValidator)} // global factory instance
// RegisterLogDriver registers the given logging driver builder with given logging
// driver name.
func RegisterLogDriver(name string, c Creator) error {
	return factory.register(name, c)
}

// RegisterLogOptValidator registers the logging option validator with
// the given logging driver name.
func RegisterLogOptValidator(name string, l LogOptValidator) error {
	return factory.registerLogOptValidator(name, l)
}

// GetLogDriver provides the logging driver builder for a logging driver name.
func GetLogDriver(name string) (Creator, error) {
	return factory.get(name)
}
// builtInLogOpts are options handled by the logging framework itself and
// therefore hidden from driver-specific validators.
var builtInLogOpts = map[string]bool{
	"mode":            true,
	"max-buffer-size": true,
}
// ValidateLogOpts checks the options for the given log driver. The
// options supported are specific to the LogDriver implementation.
func ValidateLogOpts(name string, cfg map[string]string) error {
	if name == "none" {
		return nil
	}

	mode := containertypes.LogMode(cfg["mode"])
	switch mode {
	case containertypes.LogModeBlocking, containertypes.LogModeNonBlock, containertypes.LogModeUnset:
		// recognized delivery mode
	default:
		return fmt.Errorf("logger: logging mode not supported: %s", cfg["mode"])
	}

	if s, ok := cfg["max-buffer-size"]; ok {
		if mode != containertypes.LogModeNonBlock {
			return fmt.Errorf("logger: max-buffer-size option is only supported with 'mode=%s'", containertypes.LogModeNonBlock)
		}
		if _, err := units.RAMInBytes(s); err != nil {
			return errors.Wrap(err, "error parsing option max-buffer-size")
		}
	}

	if err := validateExternal(cfg); err != nil {
		return err
	}

	registered, err := factory.driverRegistered(name)
	if err != nil {
		return err
	}
	if !registered {
		return fmt.Errorf("logger: no log driver named '%s' is registered", name)
	}

	// Strip framework-level options before handing the remainder to the
	// driver-specific validator.
	filteredOpts := make(map[string]string, len(builtInLogOpts))
	for k, v := range cfg {
		if !builtInLogOpts[k] {
			filteredOpts[k] = v
		}
	}

	if validator := factory.getLogOptValidator(name); validator != nil {
		return validator(filteredOpts)
	}
	return nil
}
// Package jsonfilelog provides the default Logger implementation for
// Docker logging. This logger logs to files on the host server in the
// JSON format.
package jsonfilelog
import (
"bytes"
"encoding/json"
"fmt"
"strconv"
"sync"
"github.com/docker/docker/daemon/logger"
"github.com/docker/docker/daemon/logger/jsonfilelog/jsonlog"
"github.com/docker/docker/daemon/logger/loggerutils"
"github.com/docker/go-units"
"github.com/pkg/errors"
)
// Name is the name of the file that the jsonlogger logs to.
const Name = "json-file"

// Every buffer will have to store the same constant json structure with the message
// len(`{"log":"","stream:"stdout","time":"2000-01-01T00:00:00.000000000Z"}\n`) = 68.
// So let's start with a buffer bigger than this.
const initialBufSize = 256

// buffersPool recycles the scratch buffers used to serialize log entries.
var buffersPool = sync.Pool{New: func() interface{} { return bytes.NewBuffer(make([]byte, 0, initialBufSize)) }}

// JSONFileLogger is Logger implementation for default Docker logging.
type JSONFileLogger struct {
	writer *loggerutils.LogFile // file the JSON lines are written to
	tag    string               // tag values requested by the user to log
	extra  json.RawMessage      // pre-marshalled extra attributes added to every entry
}
// init registers the json-file driver and its option validator with the
// logger factory; a registration failure is a programming error, hence panic.
func init() {
	if err := logger.RegisterLogDriver(Name, New); err != nil {
		panic(err)
	}
	if err := logger.RegisterLogOptValidator(Name, ValidateLogOpt); err != nil {
		panic(err)
	}
}
// New creates new JSONFileLogger which writes to filename passed in
// on given context.
//
// Recognized options: max-size (rotation threshold), max-file (number of
// rotated files to keep), compress (gzip rotated files), plus the tag and
// attribute options handled by logger.Info.
func New(info logger.Info) (logger.Logger, error) {
	// Rotation threshold in bytes; -1 means no limit.
	maxSize := int64(-1)
	if capacity, ok := info.Config["max-size"]; ok {
		parsed, err := units.FromHumanSize(capacity)
		if err != nil {
			return nil, err
		}
		if parsed <= 0 {
			return nil, errors.New("max-size must be a positive number")
		}
		maxSize = parsed
	}

	// Number of log files to keep, including the active one.
	maxFiles := 1
	if v, ok := info.Config["max-file"]; ok {
		parsed, err := strconv.Atoi(v)
		if err != nil {
			return nil, err
		}
		if parsed < 1 {
			return nil, errors.New("max-file cannot be less than 1")
		}
		maxFiles = parsed
	}

	var compress bool
	if v, ok := info.Config["compress"]; ok {
		var err error
		compress, err = strconv.ParseBool(v)
		if err != nil {
			return nil, err
		}
		// Compression only applies to rotated files, so it needs both
		// rotation (max-file > 1) and a size limit.
		if compress && (maxFiles == 1 || maxSize == -1) {
			return nil, errors.New("compress cannot be true when max-file is less than 2 or max-size is not set")
		}
	}

	extraAttrs, err := info.ExtraAttributes(nil)
	if err != nil {
		return nil, err
	}

	// no default template. only use a tag if the user asked for it
	tag, err := loggerutils.ParseLogTag(info, "")
	if err != nil {
		return nil, err
	}
	if tag != "" {
		extraAttrs["tag"] = tag
	}

	var extra json.RawMessage
	if len(extraAttrs) > 0 {
		if extra, err = json.Marshal(extraAttrs); err != nil {
			return nil, err
		}
	}

	writer, err := loggerutils.NewLogFile(info.LogPath, maxSize, maxFiles, compress, decodeFunc, 0o640, getTailReader)
	if err != nil {
		return nil, err
	}

	return &JSONFileLogger{
		writer: writer,
		tag:    tag,
		extra:  extra,
	}, nil
}
// Log converts logger.Message to jsonlog.JSONLog and serializes it to file.
// The message is recycled into the pool whether or not marshalling succeeds.
func (l *JSONFileLogger) Log(msg *logger.Message) error {
	buf := buffersPool.Get().(*bytes.Buffer)
	buf.Reset()
	defer buffersPool.Put(buf)

	ts := msg.Timestamp
	marshalErr := marshalMessage(msg, l.extra, buf)
	logger.PutMessage(msg)
	if marshalErr != nil {
		return marshalErr
	}
	return l.writer.WriteLogEntry(ts, buf.Bytes())
}
// marshalMessage encodes msg (plus any pre-marshalled extra attributes)
// as one newline-terminated JSON log entry into buf.
func marshalMessage(msg *logger.Message, extra json.RawMessage, buf *bytes.Buffer) error {
	logLine := msg.Line
	// Only terminate the logged line when the message is complete: either
	// it is not a partial, or it is the final piece of one. (The previous
	// `msg.PLogMetaData != nil &&` re-check was redundant — the right-hand
	// side of || only runs when PLogMetaData is non-nil.)
	if msg.PLogMetaData == nil || msg.PLogMetaData.Last {
		logLine = append(msg.Line, '\n')
	}
	err := (&jsonlog.JSONLogs{
		Log:      logLine,
		Stream:   msg.Source,
		Created:  msg.Timestamp,
		RawAttrs: extra,
	}).MarshalJSONBuf(buf)
	if err != nil {
		return errors.Wrap(err, "error writing log message to buffer")
	}
	err = buf.WriteByte('\n')
	return errors.Wrap(err, "error finalizing log buffer")
}
// ValidateLogOpt looks for json specific log options max-file & max-size.
func ValidateLogOpt(cfg map[string]string) error {
	for key := range cfg {
		switch key {
		case "max-file", "max-size", "compress", "labels", "labels-regex", "env", "env-regex", "tag":
			// recognized option
		default:
			return fmt.Errorf("unknown log opt '%s' for json-file log driver", key)
		}
	}
	return nil
}
// Close closes underlying file and signals all the readers
// that the logs producer is gone.
func (l *JSONFileLogger) Close() error {
	return l.writer.Close()
}

// Name returns name of this logger.
func (l *JSONFileLogger) Name() string {
	return Name
}
package jsonlog
import (
"time"
)
// JSONLog is a log message, typically a single entry from a given log stream.
type JSONLog struct {
// Log is the log message
Log string `json:"log,omitempty"`
// Stream is the log source
Stream string `json:"stream,omitempty"`
// Created is the created timestamp of log
Created time.Time `json:"time"`
// Attrs is the list of extra attributes provided by the user
Attrs map[string]string `json:"attrs,omitempty"`
}
// Reset all fields to their zero value.
func (jl *JSONLog) Reset() {
jl.Log = ""
jl.Stream = ""
jl.Created = time.Time{}
for k := range jl.Attrs {
delete(jl.Attrs, k)
}
}
package jsonlog
import (
"bytes"
"encoding/json"
"time"
"unicode/utf8"
)
// JSONLogs marshals encoded JSONLog objects
type JSONLogs struct {
	Log     []byte    `json:"log,omitempty"`
	Stream  string    `json:"stream,omitempty"`
	Created time.Time `json:"time"`

	// json-encoded bytes
	RawAttrs json.RawMessage `json:"attrs,omitempty"`
}
// MarshalJSONBuf is an optimized JSON marshaller that avoids reflection
// and unnecessary allocation.
//
// Fields are written in order log, stream, attrs, time; empty optional
// fields are skipped and `first` tracks whether a comma is needed before
// the next field. The time field is always emitted.
func (mj *JSONLogs) MarshalJSONBuf(buf *bytes.Buffer) error {
	first := true
	buf.WriteString(`{`)
	if len(mj.Log) != 0 {
		first = false
		buf.WriteString(`"log":`)
		ffjsonWriteJSONBytesAsString(buf, mj.Log)
	}
	if mj.Stream != "" {
		if first {
			first = false
		} else {
			buf.WriteString(`,`)
		}
		buf.WriteString(`"stream":`)
		ffjsonWriteJSONBytesAsString(buf, []byte(mj.Stream))
	}
	if len(mj.RawAttrs) > 0 {
		if first {
			first = false
		} else {
			buf.WriteString(`,`)
		}
		// RawAttrs is already valid JSON; copy it through verbatim.
		buf.WriteString(`"attrs":`)
		buf.Write(mj.RawAttrs)
	}
	if !first {
		buf.WriteString(`,`)
	}
	created, err := fastTimeMarshalJSON(mj.Created)
	if err != nil {
		return err
	}
	buf.WriteString(`"time":`)
	buf.WriteString(created)
	buf.WriteString(`}`)
	return nil
}
// ffjsonWriteJSONBytesAsString writes s to buf as a quoted, escaped JSON
// string. It escapes the JSON metacharacters, control characters,
// HTML-sensitive characters (<, >, &), invalid UTF-8 bytes (as \ufffd)
// and the line separators U+2028/U+2029. Runs of bytes that need no
// escaping are copied in bulk via the start index.
func ffjsonWriteJSONBytesAsString(buf *bytes.Buffer, s []byte) {
	const hex = "0123456789abcdef"

	buf.WriteByte('"')
	start := 0
	for i := 0; i < len(s); {
		if b := s[i]; b < utf8.RuneSelf {
			// Fast path: printable ASCII that needs no escaping.
			if 0x20 <= b && b != '\\' && b != '"' && b != '<' && b != '>' && b != '&' {
				i++
				continue
			}
			// Flush the unescaped run before this byte.
			if start < i {
				buf.Write(s[start:i])
			}
			switch b {
			case '\\', '"':
				buf.WriteByte('\\')
				buf.WriteByte(b)
			case '\n':
				buf.WriteByte('\\')
				buf.WriteByte('n')
			case '\r':
				buf.WriteByte('\\')
				buf.WriteByte('r')
			default:
				// Other control/HTML-sensitive bytes as \u00XX.
				buf.WriteString(`\u00`)
				buf.WriteByte(hex[b>>4])
				buf.WriteByte(hex[b&0xF])
			}
			i++
			start = i
			continue
		}
		c, size := utf8.DecodeRune(s[i:])
		if c == utf8.RuneError && size == 1 {
			// Invalid UTF-8 byte: emit the replacement character.
			if start < i {
				buf.Write(s[start:i])
			}
			buf.WriteString(`\ufffd`)
			i += size
			start = i
			continue
		}
		// U+2028/U+2029 are valid JSON but break JavaScript string
		// literals, so escape them too.
		if c == '\u2028' || c == '\u2029' {
			if start < i {
				buf.Write(s[start:i])
			}
			buf.WriteString(`\u202`)
			buf.WriteByte(hex[c&0xF])
			i += size
			start = i
			continue
		}
		i += size
	}
	if start < len(s) {
		buf.Write(s[start:])
	}
	buf.WriteByte('"')
}
package jsonlog
import (
"time"
"github.com/pkg/errors"
)
const jsonFormat = `"` + time.RFC3339Nano + `"`
// fastTimeMarshalJSON avoids one of the extra allocations that
// time.MarshalJSON is making.
func fastTimeMarshalJSON(t time.Time) (string, error) {
if y := t.Year(); y < 0 || y >= 10000 {
// RFC 3339 is clear that years are 4 digits exactly.
// See golang.org/issue/4556#c15 for more discussion.
return "", errors.New("time.MarshalJSON: year outside of range [0,9999]")
}
return t.Format(jsonFormat), nil
}
package jsonfilelog
import (
"context"
"encoding/json"
"io"
"github.com/docker/docker/daemon/logger"
"github.com/docker/docker/daemon/logger/jsonfilelog/jsonlog"
"github.com/docker/docker/daemon/logger/loggerutils"
"github.com/docker/docker/pkg/tailfile"
"github.com/moby/moby/api/types/backend"
)
// Compile-time check that JSONFileLogger supports reading logs back.
var _ logger.LogReader = (*JSONFileLogger)(nil)

// ReadLogs implements the logger's LogReader interface for the logs
// created by this driver.
func (l *JSONFileLogger) ReadLogs(ctx context.Context, config logger.ReadConfig) *logger.LogWatcher {
	return l.writer.ReadLogs(ctx, config)
}
// decodeLogLine decodes the next JSON entry from dec into the scratch
// JSONLog l and converts it into a logger.Message.
func decodeLogLine(dec *json.Decoder, l *jsonlog.JSONLog) (*logger.Message, error) {
	l.Reset()
	if err := dec.Decode(l); err != nil {
		return nil, err
	}

	var attrs []backend.LogAttr
	if n := len(l.Attrs); n != 0 {
		attrs = make([]backend.LogAttr, 0, n)
		for k, v := range l.Attrs {
			attrs = append(attrs, backend.LogAttr{Key: k, Value: v})
		}
	}
	return &logger.Message{
		Source:    l.Stream,
		Timestamp: l.Created,
		Line:      []byte(l.Log),
		Attrs:     attrs,
	}, nil
}
// decoder decodes JSON log entries from a reader, lazily creating its
// json.Decoder and reusing one scratch JSONLog across Decode calls.
type decoder struct {
	rdr io.Reader
	dec *json.Decoder
	jl  *jsonlog.JSONLog // scratch entry, reset before each decode
}
// Reset points the decoder at a new reader. The buffered json.Decoder is
// discarded (it may hold state from the old reader) while the scratch
// JSONLog is kept and cleared for reuse.
func (d *decoder) Reset(rdr io.Reader) {
	d.rdr = rdr
	d.dec = nil
	if d.jl != nil {
		d.jl.Reset()
	}
}

// Close drops every reference the decoder holds.
func (d *decoder) Close() {
	d.rdr = nil
	d.dec = nil
	d.jl = nil
}
// Decode returns the next log message from the underlying reader,
// creating the JSON decoder and scratch entry on first use.
func (d *decoder) Decode() (*logger.Message, error) {
	if d.dec == nil {
		d.dec = json.NewDecoder(d.rdr)
	}
	if d.jl == nil {
		d.jl = &jsonlog.JSONLog{}
	}
	return decodeLogLine(d.dec, d.jl)
}
// decodeFunc is used to create a decoder for the log file reader
func decodeFunc(rdr io.Reader) loggerutils.Decoder {
	// dec and jl start nil and are created lazily on the first Decode.
	return &decoder{rdr: rdr}
}
// getTailReader adapts tailfile.NewTailReader to the loggerutils
// GetTailReaderFunc shape used when reading the last req entries.
func getTailReader(ctx context.Context, r loggerutils.SizeReaderAt, req int) (loggerutils.SizeReaderAt, int, error) {
	return tailfile.NewTailReader(ctx, r, req)
}
package local
import (
"github.com/pkg/errors"
)
// CreateConfig is used to configure new instances of driver
type CreateConfig struct {
	DisableCompression bool  // when true, rotated files are not compressed
	MaxFileSize        int64 // rotation threshold in bytes
	MaxFileCount       int   // number of rotated files to keep
}
// newDefaultConfig returns a CreateConfig populated with the driver's
// built-in defaults (see the default* constants).
func newDefaultConfig() *CreateConfig {
	cfg := &CreateConfig{
		MaxFileSize:  defaultMaxFileSize,
		MaxFileCount: defaultMaxFileCount,
	}
	cfg.DisableCompression = !defaultCompressLogs
	return cfg
}
// validateConfig checks cfg for invalid or conflicting settings and
// returns a descriptive error for the first problem found.
//
// Compression only applies to rotated files, so enabling it requires a
// max file count greater than 1.
func validateConfig(cfg *CreateConfig) error {
	if cfg.MaxFileSize < 0 {
		// The previous message claimed the size must be "positive"
		// even though zero is accepted; only negative values are invalid.
		return errors.New("max size cannot be a negative number")
	}
	if cfg.MaxFileCount < 0 {
		return errors.New("max file count cannot be less than 0")
	}
	if !cfg.DisableCompression && cfg.MaxFileCount <= 1 {
		// With at most one file there is never a rotated file to compress.
		// The previous message only mentioned a count of exactly 1, but
		// the check also rejects 0.
		return errors.New("compression cannot be enabled when max file count is 1 or less")
	}
	return nil
}
package local
import (
"encoding/binary"
"io"
"math/bits"
"strconv"
"sync"
"time"
"github.com/docker/docker/daemon/logger"
"github.com/docker/docker/daemon/logger/loggerutils"
"github.com/docker/docker/errdefs"
"github.com/docker/go-units"
"github.com/moby/moby/api/types/backend"
"github.com/moby/moby/api/types/plugins/logdriver"
"github.com/pkg/errors"
)
const (
	// Name is the name of the driver
	Name = "local"

	// encodeBinaryLen is the size in bytes of each of the two big-endian
	// size markers that frame every marshalled log entry.
	encodeBinaryLen = 4
	// initialBufSize is the starting size of the reusable scratch buffers.
	initialBufSize = 2048
	// maxDecodeRetry bounds how many times a short read of a record is
	// retried before giving up (see decoder.readRecord).
	maxDecodeRetry = 20000

	// Defaults applied when the corresponding log opt is not supplied.
	defaultMaxFileSize int64 = 20 * 1024 * 1024
	defaultMaxFileCount       = 5
	defaultCompressLogs       = true
)
// buffersPool is a pool of reusable marshalling buffers. Slices are
// stored as *[]byte to avoid an extra allocation on Put.
var buffersPool = sync.Pool{New: func() interface{} {
	b := make([]byte, initialBufSize)
	return &b
}}
// LogOptKeys are the keys names used for log opts passed in to initialize the driver.
var LogOptKeys = map[string]bool{
	"max-file": true, // maximum number of log files kept (MaxFileCount)
	"max-size": true, // maximum size of a single log file (MaxFileSize)
	"compress": true, // whether rotated log files are compressed
}
// ValidateLogOpt looks for log driver specific options.
// It returns an error for the first option key this driver does not
// recognize.
func ValidateLogOpt(cfg map[string]string) error {
	for key := range cfg {
		if LogOptKeys[key] {
			continue
		}
		return errors.Errorf("unknown log opt '%s' for log driver %s", key, Name)
	}
	return nil
}
// init registers the local log driver and its option validator with the
// logger registry. A registration failure indicates a programming error
// (e.g. a duplicate registration) and is fatal.
func init() {
	if err := logger.RegisterLogDriver(Name, New); err != nil {
		panic(err)
	}
	if err := logger.RegisterLogOptValidator(Name, ValidateLogOpt); err != nil {
		panic(err)
	}
}
// driver is the "local" logging driver: it writes length-framed
// protobuf log entries to a rotating LogFile.
type driver struct {
	logfile *loggerutils.LogFile
}
// New creates a new local logger
// You must provide the `LogPath` in the passed in info argument, this is the file path that logs are written to.
// Recognized log opts ("max-size", "max-file", "compress") override the
// built-in defaults.
func New(info logger.Info) (logger.Logger, error) {
	if info.LogPath == "" {
		return nil, errdefs.System(errors.New("log path is missing -- this is a bug and should not happen"))
	}

	cfg := newDefaultConfig()

	if maxSize, ok := info.Config["max-size"]; ok {
		size, err := units.FromHumanSize(maxSize)
		if err != nil {
			return nil, errdefs.InvalidParameter(errors.Wrapf(err, "invalid value for max-size: %s", maxSize))
		}
		cfg.MaxFileSize = size
	}

	if maxFiles, ok := info.Config["max-file"]; ok {
		count, err := strconv.Atoi(maxFiles)
		if err != nil {
			return nil, errdefs.InvalidParameter(errors.Wrapf(err, "invalid value for max-file: %s", maxFiles))
		}
		cfg.MaxFileCount = count
	}

	if compress, ok := info.Config["compress"]; ok {
		enabled, err := strconv.ParseBool(compress)
		if err != nil {
			return nil, errdefs.InvalidParameter(errors.Wrap(err, "error reading compress log option"))
		}
		cfg.DisableCompression = !enabled
	}

	return newDriver(info.LogPath, cfg)
}
// marshal serializes m into *buffer as a length-framed protobuf record:
// a 4-byte big-endian size, the encoded logdriver.LogEntry, and a
// trailing 4-byte big-endian copy of the size (the trailing copy lets
// the file be walked backwards when tailing; see getTailReader).
// The buffer may be grown, in which case the new slice is written back
// through the pointer so callers can pool and reuse it.
func marshal(m *logger.Message, buffer *[]byte) error {
	proto := logdriver.LogEntry{}
	md := logdriver.PartialLogEntryMetadata{}
	resetProto(&proto)
	messageToProto(m, &proto, &md)
	protoSize := proto.Size()
	// Total record length: payload plus leading and trailing size markers.
	writeLen := protoSize + (2 * encodeBinaryLen) // + len(messageDelimiter)

	buf := *buffer
	if writeLen > cap(buf) {
		// If we already need to reallocate the buffer, make it larger to be more reusable.
		// Round to the next power of two.
		capacity := 1 << (bits.Len(uint(writeLen)) + 1)
		buf = make([]byte, writeLen, capacity)
	} else {
		buf = buf[:writeLen]
	}
	*buffer = buf

	binary.BigEndian.PutUint32(buf[:encodeBinaryLen], uint32(protoSize))
	n, err := proto.MarshalTo(buf[encodeBinaryLen:writeLen])
	if err != nil {
		return errors.Wrap(err, "error marshaling log entry")
	}
	// Sanity-check that the proto wrote exactly the bytes it promised.
	if n+(encodeBinaryLen*2) != writeLen {
		return io.ErrShortWrite
	}
	binary.BigEndian.PutUint32(buf[writeLen-encodeBinaryLen:writeLen], uint32(protoSize))
	return nil
}
// newDriver validates cfg and constructs a driver that writes to a
// rotating log file at logPath (created with mode 0640).
func newDriver(logPath string, cfg *CreateConfig) (logger.Logger, error) {
	if err := validateConfig(cfg); err != nil {
		return nil, errdefs.InvalidParameter(err)
	}

	logFile, err := loggerutils.NewLogFile(logPath, cfg.MaxFileSize, cfg.MaxFileCount, !cfg.DisableCompression, decodeFunc, 0o640, getTailReader)
	if err != nil {
		return nil, err
	}
	return &driver{logfile: logFile}, nil
}
// Name returns the name of this log driver ("local").
func (d *driver) Name() string {
	return Name
}
// Log marshals msg into a pooled buffer and appends it to the log file.
// The message is returned to the message pool as soon as its contents
// have been serialized, before the file write happens.
func (d *driver) Log(msg *logger.Message) error {
	buf := buffersPool.Get().(*[]byte)
	defer buffersPool.Put(buf)

	timestamp := msg.Timestamp
	marshalErr := marshal(msg, buf)
	logger.PutMessage(msg)
	if marshalErr != nil {
		return errors.Wrap(marshalErr, "error marshalling logger.Message")
	}
	return d.logfile.WriteLogEntry(timestamp, *buf)
}
// Close closes the underlying log file.
func (d *driver) Close() error {
	return d.logfile.Close()
}
// messageToProto copies msg into proto, reusing proto's Line buffer.
// When msg carries partial-log metadata, partial is filled in and
// attached to proto; otherwise any previous attachment is cleared.
func messageToProto(msg *logger.Message, proto *logdriver.LogEntry, partial *logdriver.PartialLogEntryMetadata) {
	proto.Source = msg.Source
	proto.TimeNano = msg.Timestamp.UnixNano()
	proto.Line = append(proto.Line[:0], msg.Line...)

	meta := msg.PLogMetaData
	proto.Partial = meta != nil
	if meta == nil {
		proto.PartialLogMetadata = nil
		return
	}
	partial.Ordinal = int32(meta.Ordinal)
	partial.Last = meta.Last
	partial.Id = meta.ID
	proto.PartialLogMetadata = partial
}
// protoToMessage converts a decoded LogEntry into a *logger.Message,
// copying the line bytes and, when present, the partial-log metadata.
// The timestamp is reconstructed from nanoseconds in UTC.
func protoToMessage(proto *logdriver.LogEntry) *logger.Message {
	msg := &logger.Message{
		Source:    proto.Source,
		Timestamp: time.Unix(0, proto.TimeNano).UTC(),
	}
	if proto.Partial {
		msg.PLogMetaData = &backend.PartialLogMetaData{
			Last:    proto.GetPartialLogMetadata().GetLast(),
			ID:      proto.GetPartialLogMetadata().GetId(),
			Ordinal: int(proto.GetPartialLogMetadata().GetOrdinal()),
		}
	}
	msg.Line = append(msg.Line[:0], proto.Line...)
	return msg
}
// resetProto zeroes proto in place so it can be reused, keeping the
// capacity of the Line buffer but detaching the metadata reference.
func resetProto(proto *logdriver.LogEntry) {
	proto.Source = ""
	proto.Line = proto.Line[:0]
	proto.TimeNano = 0
	proto.Partial = false
	if md := proto.PartialLogMetadata; md != nil {
		md.Id = ""
		md.Last = false
		md.Ordinal = 0
	}
	proto.PartialLogMetadata = nil
}
package local
import (
"bytes"
"context"
"encoding/binary"
"fmt"
"io"
"github.com/docker/docker/daemon/logger"
"github.com/docker/docker/daemon/logger/loggerutils"
"github.com/docker/docker/errdefs"
"github.com/moby/moby/api/types/plugins/logdriver"
"github.com/pkg/errors"
)
// maxMsgLen is the maximum size of the logger.Message after serialization.
// logger.defaultBufSize caps the size of Line field.
// Records whose size header claims more than this are rejected by
// decoder.Decode as corrupt.
const maxMsgLen int = 1e6 // 1MB.
// ReadLogs implements the logger.LogReader interface by delegating to
// the underlying rotating log file.
func (d *driver) ReadLogs(ctx context.Context, config logger.ReadConfig) *logger.LogWatcher {
	return d.logfile.ReadLogs(ctx, config)
}
// getTailReader returns a SizeReaderAt covering (at most) the last req
// complete records in r, together with the number of records found.
//
// It walks the file backwards using the framing written by marshal:
// every record is [4-byte size][payload][4-byte size], so the payload
// length can be read from the trailing marker and cross-checked against
// the leading one.
func getTailReader(ctx context.Context, r loggerutils.SizeReaderAt, req int) (loggerutils.SizeReaderAt, int, error) {
	size := r.Size()
	if req < 0 {
		return nil, 0, errdefs.InvalidParameter(errors.Errorf("invalid number of lines to tail: %d", req))
	}

	// Too small to hold even one framed record: nothing to return.
	if size < (encodeBinaryLen*2)+1 {
		return bytes.NewReader(nil), 0, nil
	}

	const encodeBinaryLen64 = int64(encodeBinaryLen)
	var found int

	buf := make([]byte, encodeBinaryLen)

	// offset is the end of the record currently being examined; it
	// starts at the end of the file and moves toward the beginning.
	offset := size
	for {
		select {
		case <-ctx.Done():
			return nil, 0, ctx.Err()
		default:
		}

		// Read the trailing size marker of the record ending at offset.
		n, err := r.ReadAt(buf, offset-encodeBinaryLen64)
		if err != nil && !errors.Is(err, io.EOF) {
			return nil, 0, errors.Wrap(err, "error reading log message footer")
		}

		if n != encodeBinaryLen {
			return nil, 0, errdefs.DataLoss(errors.New("unexpected number of bytes read from log message footer"))
		}

		msgLen := binary.BigEndian.Uint32(buf)

		// Read the leading size marker of the same record.
		n, err = r.ReadAt(buf, offset-encodeBinaryLen64-encodeBinaryLen64-int64(msgLen))
		if err != nil && !errors.Is(err, io.EOF) {
			return nil, 0, errors.Wrap(err, "error reading log message header")
		}

		if n != encodeBinaryLen {
			return nil, 0, errdefs.DataLoss(errors.New("unexpected number of bytes read from log message header"))
		}

		// Header and footer disagreeing means the file is corrupt or truncated.
		if msgLen != binary.BigEndian.Uint32(buf) {
			return nil, 0, errdefs.DataLoss(errors.New("log message header and footer indicate different message sizes"))
		}

		found++
		// Step over the payload and both size markers to the previous record.
		offset -= int64(msgLen)
		offset -= encodeBinaryLen64 * 2
		if found == req {
			break
		}

		if offset <= 0 {
			break
		}
	}

	return io.NewSectionReader(r, offset, size), found, nil
}
// decoder decodes length-framed protobuf log records from a stream,
// reusing a scratch buffer and proto struct across Decode calls.
type decoder struct {
	rdr   io.Reader
	proto *logdriver.LogEntry
	// buf keeps bytes from rdr.
	buf []byte
	// offset is the position in buf.
	// If offset > 0, buf[:offset] holds bytes which have been read from
	// rdr but not yet consumed (a partially read record).
	offset int
	// nextMsgLen is the length of the next log message.
	// If nextMsgLen = 0, a new value must be read from rdr.
	nextMsgLen int
}
// readRecord fills d.buf[:size] from the underlying reader, resuming at
// d.offset past any bytes already buffered by a previous short read.
//
// io.ErrUnexpectedEOF is retried up to maxDecodeRetry times —
// presumably to ride out a writer that is still in the middle of
// appending the record (TODO confirm against the writer's behavior).
// On success d.offset is reset to 0.
func (d *decoder) readRecord(size int) error {
	var err error
	for i := 0; i < maxDecodeRetry; i++ {
		var n int
		n, err = io.ReadFull(d.rdr, d.buf[d.offset:size])
		d.offset += n
		if err != nil {
			if err != io.ErrUnexpectedEOF {
				return err
			}
			// Short read: keep what we have and try again.
			continue
		}
		break
	}
	if err != nil {
		return err
	}
	d.offset = 0
	return nil
}
// Decode reads and decodes the next log record from the stream.
// It lazily allocates the proto struct and scratch buffer, reads the
// 4-byte size header when no record is pending, rejects sizes above
// maxMsgLen, and sizes the buffer to hold the payload plus the trailing
// size marker before decoding the entry itself.
func (d *decoder) Decode() (*logger.Message, error) {
	if d.proto == nil {
		d.proto = &logdriver.LogEntry{}
	} else {
		resetProto(d.proto)
	}
	if d.buf == nil {
		d.buf = make([]byte, initialBufSize)
	}

	if d.nextMsgLen == 0 {
		msgLen, err := d.decodeSizeHeader()
		if err != nil {
			return nil, err
		}

		if msgLen > maxMsgLen {
			return nil, fmt.Errorf("log message is too large (%d > %d)", msgLen, maxMsgLen)
		}

		// Resize the buffer view to payload + trailing size marker,
		// reallocating only when the current buffer is too small and
		// otherwise keeping at least the default length.
		if len(d.buf) < msgLen+encodeBinaryLen {
			d.buf = make([]byte, msgLen+encodeBinaryLen)
		} else if msgLen <= initialBufSize {
			d.buf = d.buf[:initialBufSize]
		} else {
			d.buf = d.buf[:msgLen+encodeBinaryLen]
		}

		d.nextMsgLen = msgLen
	}
	return d.decodeLogEntry()
}
// Reset prepares the decoder to read from rdr, clearing any buffered
// partial state. Resetting to the reader already in use is a no-op.
func (d *decoder) Reset(rdr io.Reader) {
	if d.rdr == rdr {
		return
	}

	d.rdr = rdr
	d.offset = 0
	d.nextMsgLen = 0
	if d.proto != nil {
		resetProto(d.proto)
	}
	if d.buf != nil {
		d.buf = d.buf[:initialBufSize]
	}
}
// Close releases the decoder's buffer and reader references; the
// decoder must not be used again afterwards.
func (d *decoder) Close() {
	if d.proto != nil {
		resetProto(d.proto)
	}
	d.buf = nil
	d.rdr = nil
}
// decodeFunc creates a Decoder reading framed records from rdr; it is
// passed to loggerutils.NewLogFile as the decoder factory.
func decodeFunc(rdr io.Reader) loggerutils.Decoder {
	return &decoder{rdr: rdr}
}
// decodeSizeHeader reads the 4-byte big-endian size marker that
// precedes each record and returns it as an int.
func (d *decoder) decodeSizeHeader() (int, error) {
	if err := d.readRecord(encodeBinaryLen); err != nil {
		return 0, errors.Wrap(err, "could not read a size header")
	}
	return int(binary.BigEndian.Uint32(d.buf[:encodeBinaryLen])), nil
}
// decodeLogEntry reads the pending record's payload (plus its trailing
// size marker), unmarshals it, and converts it to a logger.Message.
// A newline is appended to lines that complete a message (non-partial,
// or the last partial).
func (d *decoder) decodeLogEntry() (*logger.Message, error) {
	msgLen := d.nextMsgLen
	if err := d.readRecord(msgLen + encodeBinaryLen); err != nil {
		return nil, errors.Wrapf(err, "could not read a log entry (size=%d+%d)", msgLen, encodeBinaryLen)
	}
	d.nextMsgLen = 0

	if err := d.proto.Unmarshal(d.buf[:msgLen]); err != nil {
		return nil, errors.Wrapf(err, "error unmarshalling log entry (size=%d)", msgLen)
	}

	msg := protoToMessage(d.proto)
	if msg.PLogMetaData == nil || msg.PLogMetaData.Last {
		msg.Line = append(msg.Line, '\n')
	}
	return msg, nil
}
package logger
// externalValidators holds log-opt validators registered by other
// packages via RegisterExternalValidator.
var externalValidators []LogOptValidator

// RegisterExternalValidator adds the validator to the list of external validators.
// External validators are used by packages outside this package that need to add their own validation logic.
// This should only be called on package initialization.
func RegisterExternalValidator(v LogOptValidator) {
	externalValidators = append(externalValidators, v)
}
// AddBuiltinLogOpts updates the list of built-in log opts. This allows other packages to supplement additional log options
// without having to register an actual log driver. This is used by things that are more proxy log drivers and should
// not be exposed as a usable log driver to the API.
// This should only be called on package initialization.
func AddBuiltinLogOpts(opts map[string]bool) {
	for key, val := range opts {
		builtInLogOpts[key] = val
	}
}
// validateExternal runs cfg through every registered external
// validator, returning the first error encountered.
func validateExternal(cfg map[string]string) error {
	for _, validate := range externalValidators {
		if err := validate(cfg); err != nil {
			return err
		}
	}
	return nil
}
// Package logger defines interfaces that logger drivers implement to
// log messages.
//
// The other half of a logger driver is the implementation of the
// factory, which holds the contextual instance information that
// allows multiple loggers of the same type to perform different
// actions, such as logging to different locations.
package logger
import (
"context"
"sync"
"time"
"github.com/moby/moby/api/types/backend"
)
// ErrReadLogsNotSupported is returned when the underlying log driver does not support reading
type ErrReadLogsNotSupported struct{}

// Error implements the error interface.
func (ErrReadLogsNotSupported) Error() string {
	const msg = "configured logging driver does not support reading"
	return msg
}

// NotImplemented makes this error implement the `NotImplemented` interface from api/errdefs
func (ErrReadLogsNotSupported) NotImplemented() {}
const (
	// logWatcherBufferSize is the capacity of each LogWatcher's Msg
	// channel (see NewLogWatcher).
	logWatcherBufferSize = 4096
)
// messagePool recycles Message values (each with a small pre-allocated Line buffer) between NewMessage and PutMessage.
var messagePool = &sync.Pool{New: func() interface{} { return &Message{Line: make([]byte, 0, 256)} }}
// NewMessage returns a new message from the message sync.Pool
func NewMessage() *Message {
	return messagePool.Get().(*Message)
}
// PutMessage puts the specified message back in the message pool.
// The message fields are reset before putting into the pool.
func PutMessage(msg *Message) {
	msg.reset()
	messagePool.Put(msg)
}
// Message is data structure that represents piece of output produced by some
// container. The Line member is a slice of an array whose contents can be
// changed after a log driver's Log() method returns.
//
// Message is subtyped from backend.LogMessage because there is a lot of
// internal complexity around the Message type that should not be exposed
// to any package not explicitly importing the logger type.
//
// Obtain instances via NewMessage and release them via PutMessage.
type Message backend.LogMessage
// reset sets the message back to default values
// This is used when putting a message back into the message pool.
// The Line buffer's backing array is retained (truncated to zero length)
// so it can be reused.
func (m *Message) reset() {
	line := m.Line[:0]
	*m = Message{Line: line}
}
// AsLogMessage returns a pointer to the message as a pointer to
// backend.LogMessage, which is an identical type with a different purpose.
// This is a pure type conversion; no copy is made.
func (m *Message) AsLogMessage() *backend.LogMessage {
	return (*backend.LogMessage)(m)
}
// Logger is the interface for docker logging drivers.
type Logger interface {
	// Log writes a log message. The implementation takes ownership of
	// the message and is expected to return it to the pool (PutMessage)
	// once done with it.
	Log(*Message) error
	// Name returns the name of the driver.
	Name() string
	// Close releases any resources held by the driver.
	Close() error
}
// SizedLogger is the interface for logging drivers that can control
// the size of buffer used for their messages.
type SizedLogger interface {
	Logger
	// BufSize returns the driver's message buffer size.
	BufSize() int
}
// ReadConfig is the configuration passed into ReadLogs.
type ReadConfig struct {
	Since  time.Time // lower time bound for messages (enforced by the reader implementations)
	Until  time.Time // upper time bound for messages
	Tail   int       // number of trailing messages to return; see GetTailReaderFunc implementations
	Follow bool      // keep streaming messages as they are written
}
// LogReader is the interface for reading log messages for loggers that support reading.
type LogReader interface {
	// ReadLogs reads logs from underlying logging backend.
	// Results are delivered through the returned LogWatcher's channels.
	ReadLogs(context.Context, ReadConfig) *LogWatcher
}
// LogWatcher is used when consuming logs read from the LogReader interface.
type LogWatcher struct {
	// For sending log messages to a reader.
	Msg chan *Message
	// For sending error messages that occur while reading logs.
	Err          chan error
	consumerOnce sync.Once     // ensures consumerGone is closed at most once
	consumerGone chan struct{} // closed when the consumer stops reading
}
// NewLogWatcher returns a new LogWatcher with buffered message and
// error channels ready for use.
func NewLogWatcher() *LogWatcher {
	w := &LogWatcher{
		Msg:          make(chan *Message, logWatcherBufferSize),
		Err:          make(chan error, 1),
		consumerGone: make(chan struct{}),
	}
	return w
}
// ConsumerGone notifies that the logs consumer is gone.
// Safe to call multiple times; the underlying channel is closed exactly
// once.
func (w *LogWatcher) ConsumerGone() {
	w.consumerOnce.Do(func() { close(w.consumerGone) })
}
// WatchConsumerGone returns a channel receiver that receives notification
// when the log watcher consumer is gone.
// The channel is closed (never sent on) once ConsumerGone is called.
func (w *LogWatcher) WatchConsumerGone() <-chan struct{} {
	return w.consumerGone
}
// Capability defines the list of capabilities that a driver can implement
// These capabilities are not required to be a logging driver, however do
// determine how a logging driver can be used
type Capability struct {
	// ReadLogs determines if a log driver can read back logs
	// (i.e. whether it implements LogReader).
	ReadLogs bool
}
package logger
import (
"context"
"github.com/containerd/log"
"golang.org/x/time/rate"
)
// Rates based on journald defaults of 10,000 messages in 30s
// (10000/30 ≈ 333 events per second, with a burst of 333).
// reference: https://www.freedesktop.org/software/systemd/man/journald.conf.html#RateLimitIntervalSec=
var logErrorLimiter = rate.NewLimiter(333, 333)
// logDriverError logs errors produced by log drivers to the daemon logs. It also increments the logWritesFailedCount
// metric.
// Logging to the daemon logs is limited to 333 operations per second at most. If this limit is exceeded, the
// logWritesFailedCount is still counted, but logging to the daemon logs is omitted in order to prevent disk saturation.
func logDriverError(loggerName, msgLine string, logErr error) {
	logWritesFailedCount.Inc(1)
	if !logErrorLimiter.Allow() {
		// Over the rate limit: count the failure but skip the log line.
		return
	}
	log.G(context.TODO()).WithFields(log.Fields{
		"error":   logErr,
		"driver":  loggerName,
		"message": msgLine,
	}).Error("Error writing log message")
}
package cache
import (
"context"
"strconv"
"github.com/containerd/log"
"github.com/docker/docker/daemon/logger"
"github.com/docker/docker/daemon/logger/local"
"github.com/docker/go-units"
"github.com/moby/moby/api/types/container"
"github.com/pkg/errors"
)
const (
	// DriverName is the name of the driver used for local log caching
	DriverName = local.Name

	// cachePrefix is prepended to the local driver's option keys to form
	// the cache-specific option names (e.g. "cache-max-size"; see init).
	cachePrefix = "cache-"
	// cacheDisabledKey is the option key used to turn off log caching.
	cacheDisabledKey = cachePrefix + "disabled"
)

// builtInCacheLogOpts is the set of cache-related option keys; it is
// extended in init with the prefixed local-driver opts and registered
// with the logger package.
var builtInCacheLogOpts = map[string]bool{
	cacheDisabledKey: true,
}

// Compile-time assertion that loggerWithCache supports reading logs.
var _ logger.LogReader = (*loggerWithCache)(nil)
// WithLocalCache wraps l so that every message is also written to a
// local cache logger (the "local" driver), from which logs can be read
// back even when the primary driver does not support reading.
func WithLocalCache(l logger.Logger, info logger.Info) (logger.Logger, error) {
	initLogger, err := logger.GetLogDriver(DriverName)
	if err != nil {
		return nil, err
	}

	cacher, err := initLogger(info)
	if err != nil {
		return nil, errors.Wrap(err, "error initializing local log cache driver")
	}

	mode := container.LogMode(info.Config["mode"])
	if mode == container.LogModeUnset || mode == container.LogModeNonBlock {
		// Non-blocking (or unset) mode: decouple cache writes through a
		// ring logger, sized by max-buffer-size when provided (-1 otherwise).
		var size int64 = -1
		if s, exists := info.Config["max-buffer-size"]; exists {
			size, err = units.RAMInBytes(s)
			if err != nil {
				return nil, err
			}
		}
		cacher = logger.NewRingLogger(cacher, info, size)
	}

	return &loggerWithCache{l: l, cache: cacher}, nil
}
// loggerWithCache writes every message to both the wrapped logger l and
// a local cache logger; reads are always served from the cache.
type loggerWithCache struct {
	l     logger.Logger // primary (user-configured) logger
	cache logger.Logger // local cache logger, used for reads
}

var _ logger.SizedLogger = &loggerWithCache{}
// BufSize returns the buffer size of the underlying logger.
// Returns -1 if the logger doesn't match SizedLogger interface.
func (l *loggerWithCache) BufSize() int {
	sized, ok := l.l.(logger.SizedLogger)
	if !ok {
		return -1
	}
	return sized.BufSize()
}
// Log writes msg to the wrapped logger and to the cache.
// The message must be copied up front because the wrapped driver resets
// and pools it once its Log call completes.
func (l *loggerWithCache) Log(msg *logger.Message) error {
	// copy the message as the original will be reset once the call to `Log` is complete
	dup := logger.NewMessage()
	dumbCopyMessage(dup, msg)

	if err := l.l.Log(msg); err != nil {
		// The cache never sees dup on this path; return it to the pool
		// instead of abandoning it.
		logger.PutMessage(dup)
		return err
	}
	return l.cache.Log(dup)
}
// Name returns the name of the wrapped (primary) logger.
func (l *loggerWithCache) Name() string {
	return l.l.Name()
}
// ReadLogs serves reads from the cache logger rather than the wrapped
// logger.
// NOTE(review): the unchecked type assertion assumes the cache logger
// always implements LogReader (true for the local driver); it would
// panic otherwise.
func (l *loggerWithCache) ReadLogs(ctx context.Context, config logger.ReadConfig) *logger.LogWatcher {
	return l.cache.(logger.LogReader).ReadLogs(ctx, config)
}
// Close closes both loggers. The wrapped logger's error is returned;
// a failure to close the cache is only logged.
func (l *loggerWithCache) Close() error {
	primaryErr := l.l.Close()
	if cacheErr := l.cache.Close(); cacheErr != nil {
		log.G(context.TODO()).WithError(cacheErr).Warn("error while shutting cache logger")
	}
	return primaryErr
}
// ShouldUseCache reads the log opts to determine if caching should be enabled.
// Caching defaults to on; it is disabled only when the cache-disabled
// option is set to a value that parses as true.
func ShouldUseCache(cfg map[string]string) bool {
	v := cfg[cacheDisabledKey]
	if v == "" {
		return true
	}
	disabled, err := strconv.ParseBool(v)
	if err != nil {
		// This shouldn't happen since the values are validated before hand.
		return false
	}
	return !disabled
}
// dumbCopyMessage is a bit of a fake copy but avoids extra allocations which
// are not necessary for this use case.
// Only Line is deep-copied; the Attrs slice and PLogMetaData pointer
// remain shared with src.
func dumbCopyMessage(dst, src *logger.Message) {
	dst.Line = append(dst.Line[:0], src.Line...)
	dst.Source = src.Source
	dst.Timestamp = src.Timestamp
	dst.PLogMetaData = src.PLogMetaData
	dst.Err = src.Err
	dst.Attrs = src.Attrs
}
package cache
import (
"strconv"
"github.com/docker/docker/daemon/logger"
"github.com/docker/docker/daemon/logger/local"
"github.com/pkg/errors"
)
// init registers the cache-prefixed variants of the local driver's log
// opts (e.g. "cache-max-size") as built-in opts and hooks the cache
// option validator into the logger package.
func init() {
	for k, v := range local.LogOptKeys {
		builtInCacheLogOpts[cachePrefix+k] = v
	}
	logger.AddBuiltinLogOpts(builtInCacheLogOpts)
	logger.RegisterExternalValidator(validateLogCacheOpts)
}
// validateLogCacheOpts checks that the cache-disabled option, when
// present, parses as a boolean.
func validateLogCacheOpts(cfg map[string]string) error {
	v := cfg[cacheDisabledKey]
	if v == "" {
		return nil
	}
	if _, err := strconv.ParseBool(v); err != nil {
		return errors.Errorf("invalid value for option %s: %s", cacheDisabledKey, v)
	}
	return nil
}
// MergeDefaultLogConfig reads the default log opts and makes sure that any caching related keys that exist there are
// added to dst.
// Keys already present in dst are never overwritten.
func MergeDefaultLogConfig(dst, defaults map[string]string) {
	for key, val := range defaults {
		if !builtInCacheLogOpts[key] {
			continue
		}
		if _, ok := dst[key]; ok {
			continue
		}
		dst[key] = val
	}
}
//go:build !windows
package loggerutils
import "os"
// openFile opens name with the given flags and permissions.
// On non-Windows platforms this is plain os.OpenFile.
func openFile(name string, flag int, perm os.FileMode) (*os.File, error) {
	return os.OpenFile(name, flag, perm)
}
// open opens name for reading; plain os.Open on non-Windows platforms.
func open(name string) (*os.File, error) {
	return os.Open(name)
}
// unlink removes name; plain os.Remove on non-Windows platforms.
func unlink(name string) error {
	return os.Remove(name)
}
package loggerutils
import (
"context"
"fmt"
"io"
"os"
"github.com/containerd/log"
"github.com/docker/docker/daemon/logger"
"github.com/pkg/errors"
)
// follow holds the state for tailing a LogFile as it is written,
// decoding newly appended records and forwarding them to a LogWatcher.
type follow struct {
	LogFile   *LogFile
	Watcher   *logger.LogWatcher
	Decoder   Decoder
	Forwarder *forwarder

	log *log.Entry
	// c receives write-position notifications from the LogFile; it is
	// allocated once in Do and reused across nextPos calls.
	c   chan logPos
}
// Do follows the log file as it is written, starting from f at read.
// It loops until the context is cancelled, the consumer goes away, or
// the LogFile is closed: each iteration waits for the write position to
// advance, forwards the newly written bytes, and — when a rotation is
// observed — finishes the old file and reopens the new one.
func (fl *follow) Do(ctx context.Context, f *os.File, read logPos) {
	fl.log = log.G(ctx).WithFields(log.Fields{
		"module": "logger",
		"file":   f.Name(),
	})
	// Optimization: allocate the write-notifications channel only once and
	// reuse it for multiple invocations of nextPos().
	fl.c = make(chan logPos, 1)

	defer func() {
		if err := f.Close(); err != nil && !errors.Is(err, os.ErrClosed) {
			fl.log.WithError(err).Warn("error closing current log file")
		}
	}()

	for {
		wrote, ok := fl.nextPos(ctx, read)
		if !ok {
			return
		}

		if wrote.rotation != read.rotation {
			// Flush the current file before moving on to the next.
			if _, err := f.Seek(read.size, io.SeekStart); err != nil {
				fl.Watcher.Err <- err
				return
			}
			if !fl.forward(ctx, f) {
				return
			}

			// Open the new file, which has the same name as the old
			// file thanks to file rotation. Make no mistake: they
			// are different files, with distinct identities.
			// Atomically capture the wrote position to make
			// absolutely sure that the position corresponds to the
			// file we have opened; more rotations could have
			// occurred since we previously received it.
			if err := f.Close(); err != nil {
				fl.log.WithError(err).Warn("error closing rotated log file")
			}
			var err error
			func() {
				// Hold the filesystem-operation lock and the read
				// state together so the open and the position
				// capture see a consistent rotation state.
				fl.LogFile.fsopMu.RLock()
				st := <-fl.LogFile.read
				defer func() {
					fl.LogFile.read <- st
					fl.LogFile.fsopMu.RUnlock()
				}()
				f, err = open(f.Name())
				wrote = st.pos
			}()
			// We tried to open the file inside a critical section
			// so we shouldn't have been racing the rotation of the
			// file. Any error, even fs.ErrNotFound, is exceptional.
			if err != nil {
				fl.Watcher.Err <- fmt.Errorf("logger: error opening log file for follow after rotation: %w", err)
				return
			}

			if nrot := wrote.rotation - read.rotation; nrot > 1 {
				fl.log.WithField("missed-rotations", nrot).
					Warn("file rotations were missed while following logs; some log messages have been skipped over")
			}

			// Set up our read position to start from the top of the file.
			read.size = 0
		}

		// Forward only the bytes written since our last read position.
		if !fl.forward(ctx, io.NewSectionReader(f, read.size, wrote.size-read.size)) {
			return
		}

		read = wrote
	}
}
// nextPos waits until the write position of the LogFile being followed has
// advanced from current and returns the new position.
// ok is false when following should stop: the context was cancelled,
// the consumer went away, or the LogFile was closed with nothing left
// to read.
func (fl *follow) nextPos(ctx context.Context, current logPos) (next logPos, ok bool) {
	var st logReadState
	select {
	case <-ctx.Done():
		return current, false
	case <-fl.Watcher.WatchConsumerGone():
		return current, false
	case st = <-fl.LogFile.read: // Acquire ownership of the read state.
	}

	// Have any logs been written since we last checked?
	if st.pos == current { // Nope.
		// Add ourself to the notify list.
		st.wait = append(st.wait, fl.c)
	} else { // Yes.
		// "Notify" ourself immediately.
		fl.c <- st.pos
	}
	fl.LogFile.read <- st // Release the read state.

	select {
	case <-fl.LogFile.closed: // No more logs will be written.
		select { // Have we followed to the end?
		case next = <-fl.c: // No: received a new position.
		default: // Yes.
			return current, false
		}
	case <-fl.Watcher.WatchConsumerGone():
		return current, false
	case next = <-fl.c:
	}
	return next, true
}
// forward decodes log messages from r and forwards them to the log watcher.
// The decoder is reset onto r first, so r may be a new section of the
// same file or a freshly opened rotation.
//
// The return value, cont, signals whether following should continue.
func (fl *follow) forward(ctx context.Context, r io.Reader) (cont bool) {
	fl.Decoder.Reset(r)
	return fl.Forwarder.Do(ctx, fl.Watcher, fl.Decoder.Decode)
}
package loggerutils
import (
"bytes"
"github.com/docker/docker/daemon/logger"
"github.com/docker/docker/daemon/logger/templates"
)
// DefaultTemplate defines the defaults template logger should use.
// It is executed against a logger.Info value (see ParseLogTag).
const DefaultTemplate = "{{.ID}}"
// ParseLogTag generates a context aware tag for consistency across different
// log drivers based on the context of the running container.
// The template text is taken from the "tag" log opt, falling back to
// defaultTemplate, and is executed against info.
func ParseLogTag(info logger.Info, defaultTemplate string) (string, error) {
	tmplText := info.Config["tag"]
	if tmplText == "" {
		tmplText = defaultTemplate
	}

	tmpl, err := templates.NewParse("log-tag", tmplText)
	if err != nil {
		return "", err
	}

	var out bytes.Buffer
	if err := tmpl.Execute(&out, &info); err != nil {
		return "", err
	}
	return out.String(), nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package loggerutils
import (
"compress/gzip"
"context"
"encoding/json"
"fmt"
"io"
"io/fs"
"math"
"os"
"slices"
"strconv"
"sync"
"time"
"github.com/containerd/containerd/v2/pkg/tracing"
"github.com/containerd/log"
"github.com/docker/docker/daemon/logger"
"github.com/docker/docker/pkg/pools"
"github.com/pkg/errors"
"go.opentelemetry.io/otel/attribute"
)
// rotateFileMetadata is a metadata of the gzip header of the compressed log file
type rotateFileMetadata struct {
	// LastTime is the timestamp of the last log entry written to the
	// file before it was rotated and compressed.
	LastTime time.Time `json:"lastTime,omitempty"`
}
// LogFile is Logger implementation for default Docker logging.
type LogFile struct {
	mu       sync.Mutex // protects the logfile access
	closed   chan struct{}
	rotateMu sync.Mutex // blocks the next rotation until the current rotation is completed
	// Lock out readers while performing a non-atomic sequence of filesystem
	// operations (RLock: open, Lock: rename, delete).
	//
	// fsopMu should be locked for writing only while holding rotateMu.
	fsopMu sync.RWMutex

	// Logger configuration

	capacity int64 // maximum size of each file
	maxFiles int   // maximum number of files
	compress bool  // whether old versions of log files are compressed
	perms    os.FileMode // permission bits for newly created log files

	// Log file codec

	createDecoder MakeDecoderFn     // factory for decoders used by readers
	getTailReader GetTailReaderFunc // truncates a reader to the last N entries

	// Log reader state in a 1-buffered channel.
	//
	// Share memory by communicating: receive to acquire, send to release.
	// The state struct is passed around by value so that use-after-send
	// bugs cannot escalate to data races.
	//
	// A method which receives the state value takes ownership of it. The
	// owner is responsible for either passing ownership along or sending
	// the state back to the channel. By convention, the semantics of
	// passing along ownership is expressed with function argument types.
	// Methods which take a pointer *logReadState argument borrow the state,
	// analogous to functions which require a lock to be held when calling.
	// The caller retains ownership. Calling a method which takes a
	// value logFileState argument gives ownership to the callee.
	read chan logReadState

	// decompress provides readers with views of compressed rotated files.
	decompress *sharedTempFileConverter

	pos           logPos    // Current log file write position.
	f             *os.File  // Current log file for writing.
	lastTimestamp time.Time // timestamp of the last log
}
// logPos identifies a write position within the sequence of log files
// produced by a LogFile.
type logPos struct {
	// Size of the current file.
	size int64
	// File rotation sequence number (modulo 2**16).
	rotation uint16
}

// logReadState is the reader-visible state passed by value through the
// LogFile.read channel (receive to acquire, send to release).
type logReadState struct {
	// Current log file position.
	pos logPos
	// Wait list to be notified of the value of pos next time it changes.
	wait []chan<- logPos
}
// MakeDecoderFn creates a decoder for the given stream.
type MakeDecoderFn func(rdr io.Reader) Decoder

// Decoder is for reading logs
// It is created by the log reader by calling the `MakeDecoderFunc`
type Decoder interface {
	// Reset resets the decoder
	// Reset is called for certain events, such as log rotations
	Reset(io.Reader)
	// Decode decodes the next log message from the stream
	Decode() (*logger.Message, error)
	// Close signals to the decoder that it can release whatever resources it was using.
	Close()
}
}
// SizeReaderAt defines a ReaderAt that also reports its size.
// This is used for tailing log files.
type SizeReaderAt interface {
io.Reader
io.ReaderAt
Size() int64
}
// GetTailReaderFunc is used to truncate a reader to only read as much as is required
// in order to get the passed in number of log lines.
// It returns the sectioned reader, the number of lines that the section reader
// contains, and any error that occurs.
type GetTailReaderFunc func(ctx context.Context, f SizeReaderAt, nLogLines int) (rdr SizeReaderAt, nLines int, err error)
// NewLogFile creates new LogFile
// The file at logPath is opened for appending (created with the given
// permissions when missing), and the write position is initialized to
// the file's current size.
func NewLogFile(logPath string, capacity int64, maxFiles int, compress bool, decodeFunc MakeDecoderFn, perms os.FileMode, getTailReader GetTailReaderFunc) (*LogFile, error) {
	logFile, err := openFile(logPath, os.O_WRONLY|os.O_APPEND|os.O_CREATE, perms)
	if err != nil {
		return nil, err
	}

	size, err := logFile.Seek(0, io.SeekEnd)
	if err != nil {
		return nil, err
	}

	pos := logPos{
		size: size,
		// Force a wraparound on first rotation to shake out any
		// modular-arithmetic bugs.
		rotation: math.MaxUint16,
	}

	readState := make(chan logReadState, 1)
	readState <- logReadState{pos: pos}

	lf := &LogFile{
		f:             logFile,
		read:          readState,
		pos:           pos,
		closed:        make(chan struct{}),
		capacity:      capacity,
		maxFiles:      maxFiles,
		compress:      compress,
		decompress:    newSharedTempFileConverter(decompress),
		createDecoder: decodeFunc,
		perms:         perms,
		getTailReader: getTailReader,
	}
	return lf, nil
}
// WriteLogEntry writes the provided log message to the current log file.
// This may trigger a rotation event if the max file/capacity limits are hit.
// After a successful write, any followers blocked in nextPos are
// notified of the new write position.
func (w *LogFile) WriteLogEntry(timestamp time.Time, marshalled []byte) error {
	select {
	case <-w.closed:
		return errors.New("cannot write because the output file was closed")
	default:
	}

	w.mu.Lock()
	defer w.mu.Unlock()

	// Are we due for a rotation? (capacity == -1 disables rotation.)
	if w.capacity != -1 && w.pos.size >= w.capacity {
		if err := w.rotate(); err != nil {
			return errors.Wrap(err, "error rotating log file")
		}
	}

	n, err := w.f.Write(marshalled)
	if err != nil {
		return errors.Wrap(err, "error writing log entry")
	}
	w.pos.size += int64(n)
	w.lastTimestamp = timestamp

	// Notify any waiting readers that there is a new log entry to read.
	st := <-w.read
	defer func() { w.read <- st }()
	st.pos = w.pos

	for _, c := range st.wait {
		c <- st.pos
	}
	// Optimization: retain the backing array to save a heap allocation next
	// time a reader appends to the list.
	if st.wait != nil {
		st.wait = st.wait[:0]
	}
	return nil
}
// rotate moves the current log file out of the way (shifting older
// files up by one), opens a fresh file under the same name for writing,
// and — when compression is enabled and more than one file is kept —
// starts a goroutine to compress the just-rotated file.
//
// Called with w.mu held (see WriteLogEntry). rotateMu is held for the
// full rotation, including the asynchronous compression step; note the
// unlock is handed off to the compression goroutine in that case.
func (w *LogFile) rotate() (retErr error) {
	w.rotateMu.Lock()
	noCompress := w.maxFiles <= 1 || !w.compress
	defer func() {
		// If we aren't going to run the goroutine to compress the log file, then we need to unlock in this function.
		// Otherwise the lock will be released in the goroutine that handles compression.
		if retErr != nil || noCompress {
			w.rotateMu.Unlock()
		}
	}()

	fname := w.f.Name()
	if err := w.f.Close(); err != nil {
		// if there was an error during a prior rotate, the file could already be closed
		if !errors.Is(err, fs.ErrClosed) {
			return errors.Wrap(err, "error closing file")
		}
	}

	file, err := func() (*os.File, error) {
		// Exclude readers while the rename/delete/reopen sequence is
		// in flight (they hold fsopMu.RLock while opening files).
		w.fsopMu.Lock()
		defer w.fsopMu.Unlock()

		if err := rotate(fname, w.maxFiles, w.compress); err != nil {
			log.G(context.TODO()).WithError(err).Warn("Error rotating log file, log data may have been lost")
		} else {
			// We may have readers working their way through the
			// current log file so we can't truncate it. We need to
			// start writing new logs to an empty file with the same
			// name as the current one so we need to rotate the
			// current file out of the way.
			if w.maxFiles < 2 {
				if err := unlink(fname); err != nil && !errors.Is(err, fs.ErrNotExist) {
					log.G(context.TODO()).WithError(err).Error("Error unlinking current log file")
				}
			} else {
				if err := os.Rename(fname, fname+".1"); err != nil && !errors.Is(err, fs.ErrNotExist) {
					log.G(context.TODO()).WithError(err).Error("Error renaming current log file")
				}
			}
		}

		// Notwithstanding the above, open with the truncate flag anyway
		// in case rotation didn't work out as planned.
		return openFile(fname, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, w.perms)
	}()
	if err != nil {
		return err
	}
	w.f = file
	w.pos = logPos{rotation: w.pos.rotation + 1}

	if noCompress {
		return nil
	}

	ts := w.lastTimestamp
	go func() {
		defer w.rotateMu.Unlock()
		// No need to hold fsopMu as at no point will the filesystem be
		// in a state which would cause problems for readers. Opening
		// the uncompressed file is tried first, falling back to the
		// compressed one. compressFile only deletes the uncompressed
		// file once the compressed one is fully written out, so at no
		// point during the compression process will a reader fail to
		// open a complete copy of the file.
		if err := compressFile(fname+".1", ts); err != nil {
			log.G(context.TODO()).WithError(err).Error("Error compressing log file after rotation")
		}
	}()
	return nil
}
// rotate shifts the numbered rotated-log files up one position each,
// discarding the oldest (name.<maxFiles-1>), which frees the ".1" slot for
// the caller to rotate the live file into. When compression is enabled, the
// rotated files carry a ".gz" suffix. A maxFiles below 2 means no rotated
// files are kept, so there is nothing to shuffle.
func rotate(name string, maxFiles int, compress bool) error {
	if maxFiles < 2 {
		return nil
	}
	ext := ""
	if compress {
		ext = ".gz"
	}
	// Drop the oldest slot first so the renames below never clobber data.
	oldest := fmt.Sprintf("%s.%d%s", name, maxFiles-1, ext)
	if err := unlink(oldest); err != nil && !errors.Is(err, fs.ErrNotExist) {
		return errors.Wrap(err, "error removing oldest log file")
	}
	// Rename name.<i-1> -> name.<i>, walking from the oldest slot down so
	// each target slot is free before it is written.
	for i := maxFiles - 1; i > 1; i-- {
		dst := name + "." + strconv.Itoa(i) + ext
		src := name + "." + strconv.Itoa(i-1) + ext
		renameErr := os.Rename(src, dst)
		log.G(context.TODO()).WithError(renameErr).WithField("source", src).WithField("target", dst).Trace("Rotating log file")
		if renameErr != nil && !errors.Is(renameErr, fs.ErrNotExist) {
			return renameErr
		}
	}
	return nil
}
// compressFile gzips the file at fileName into fileName+".gz", embedding
// lastTimestamp as JSON metadata in the gzip header, and removes the source
// file only after the compressed copy has been fully written. A missing
// source file is treated as a no-op. On failure, the partially written .gz
// file is removed.
func compressFile(fileName string, lastTimestamp time.Time) (retErr error) {
	file, err := open(fileName)
	if err != nil {
		if errors.Is(err, fs.ErrNotExist) {
			log.G(context.TODO()).WithField("file", fileName).WithError(err).Debug("Could not open log file to compress")
			return nil
		}
		return errors.Wrap(err, "failed to open log file")
	}
	defer func() {
		file.Close()
		// Only delete the source once compression succeeded, so readers can
		// always open a complete copy of the data.
		if retErr == nil {
			err := unlink(fileName)
			if err != nil && !errors.Is(err, fs.ErrNotExist) {
				retErr = errors.Wrap(err, "failed to remove source log file")
			}
		}
	}()
	outFile, err := openFile(fileName+".gz", os.O_CREATE|os.O_TRUNC|os.O_RDWR, 0o640)
	if err != nil {
		return errors.Wrap(err, "failed to open or create gzip log file")
	}
	defer func() {
		outFile.Close()
		// Clean up the partial .gz file on any failure.
		if retErr != nil {
			if err := unlink(fileName + ".gz"); err != nil && !errors.Is(err, fs.ErrNotExist) {
				log.G(context.TODO()).WithError(err).Error("Error cleaning up after failed log compression")
			}
		}
	}()
	compressWriter := gzip.NewWriter(outFile)
	defer compressWriter.Close()
	// Add the last log entry timestamp to the gzip header
	extra := rotateFileMetadata{}
	extra.LastTime = lastTimestamp
	compressWriter.Header.Extra, err = json.Marshal(&extra)
	if err != nil {
		// Here log the error only and don't return since this is just an optimization.
		log.G(context.TODO()).Warningf("Failed to marshal gzip header as JSON: %v", err)
	}
	_, err = pools.Copy(compressWriter, file)
	if err != nil {
		return errors.Wrapf(err, "error compressing log file %s", fileName)
	}
	return nil
}
// MaxFiles returns the maximum number of log files kept before the oldest
// rotated file is discarded.
func (w *LogFile) MaxFiles() int {
	return w.maxFiles
}
// Close closes underlying file and signals all readers to stop.
// It is idempotent: once closed, further calls return nil.
func (w *LogFile) Close() error {
	w.mu.Lock()
	defer w.mu.Unlock()
	// Already closed? The closed channel is only closed below, under w.mu.
	select {
	case <-w.closed:
		return nil
	default:
	}
	// The file may already be closed if a prior rotate failed partway.
	if err := w.f.Close(); err != nil && !errors.Is(err, fs.ErrClosed) {
		return err
	}
	close(w.closed)
	// Wait until any in-progress rotation is complete.
	w.rotateMu.Lock()
	defer w.rotateMu.Unlock()
	return nil
}
// ReadLogs decodes entries from log files.
//
// It is the caller's responsibility to call ConsumerGone on the LogWatcher.
func (w *LogFile) ReadLogs(ctx context.Context, config logger.ReadConfig) *logger.LogWatcher {
	ctx, span := tracing.StartSpan(ctx, "logger.LogFile.ReadLogs")
	defer span.End()
	span.SetAttributes(tracing.Attribute("config", config))
	watcher := logger.NewLogWatcher()
	// Lock out filesystem operations so that we can capture the read
	// position and atomically open the corresponding log file, without the
	// file getting rotated out from under us.
	w.fsopMu.RLock()
	// Capture the read position synchronously to ensure that we start
	// following from the last entry logged before ReadLogs was called,
	// which is required for flake-free unit testing.
	st := <-w.read
	pos := st.pos
	w.read <- st
	// readLogsLocked takes over responsibility for releasing fsopMu.
	go w.readLogsLocked(ctx, pos, config, watcher)
	return watcher
}
// tailFiles opens the rotated log files plus the current file's captured
// contents and replays the requested tail through dec to the watcher. It
// reports whether the caller should continue (e.g. go on to follow mode).
//
// tailFiles must be called with w.fsopMu locked for reads.
// w.fsopMu.RUnlock() is called before returning.
func (w *LogFile) tailFiles(ctx context.Context, config logger.ReadConfig, watcher *logger.LogWatcher, current SizeReaderAt, dec Decoder, fwd *forwarder) (cont bool) {
	// Tail == 0 means "no history requested"; nothing to do.
	if config.Tail == 0 {
		w.fsopMu.RUnlock()
		return true
	}
	ctx, span := tracing.StartSpan(ctx, "logger.Logfile.TailLogs")
	defer func() {
		span.SetAttributes(attribute.Bool("continue", cont))
		span.End()
	}()
	files, err := w.openRotatedFiles(ctx, config)
	// Files are open; the filesystem may change freely from here on.
	w.fsopMu.RUnlock()
	if err != nil {
		// TODO: Should we allow this to continue (as in set `cont=true`) and not error out the log stream?
		err = errors.Wrap(err, "error opening rotated log files")
		span.SetStatus(err)
		watcher.Err <- err
		return false
	}
	if current.Size() > 0 {
		files = append(files, &sizeReaderAtOpener{current, "current"})
	}
	return tailFiles(ctx, files, watcher, dec, w.getTailReader, config.Tail, fwd)
}
// sizeReaderAtOpener adapts an already-available SizeReaderAt (the live log
// file's captured contents) to the fileOpener interface so it can be tailed
// alongside rotated files.
type sizeReaderAtOpener struct {
	SizeReaderAt
	ref string
}

// ReaderAt hands back the wrapped reader; nothing needs opening, and the
// returned closer is a no-op.
func (o *sizeReaderAtOpener) ReaderAt(context.Context) (sizeReaderAtCloser, error) {
	return &sizeReaderAtWithCloser{o, nil}, nil
}

// Close is a no-op; the underlying reader is owned by the caller.
func (o *sizeReaderAtOpener) Close() {}

// Ref returns the label used to identify this reader in traces.
func (o *sizeReaderAtOpener) Ref() string { return o.ref }
// sizeReaderAtWithCloser pairs a SizeReaderAt with an optional close
// function. A nil SizeReaderAt behaves as an empty (immediately-EOF) reader,
// and a nil close function makes Close a no-op.
type sizeReaderAtWithCloser struct {
	SizeReaderAt
	close func() error
}

func (r *sizeReaderAtWithCloser) ReadAt(p []byte, offset int64) (int, error) {
	if sra := r.SizeReaderAt; sra != nil {
		return sra.ReadAt(p, offset)
	}
	return 0, io.EOF
}

func (r *sizeReaderAtWithCloser) Read(p []byte) (int, error) {
	if sra := r.SizeReaderAt; sra != nil {
		return sra.Read(p)
	}
	return 0, io.EOF
}

func (r *sizeReaderAtWithCloser) Size() int64 {
	if sra := r.SizeReaderAt; sra != nil {
		return sra.Size()
	}
	return 0
}

func (r *sizeReaderAtWithCloser) Close() error {
	if r.close == nil {
		return nil
	}
	return r.close()
}
// readLogsLocked is the bulk of the implementation of ReadLogs.
//
// It replays the requested tail of historical logs and then, if configured,
// switches to follow mode on the live file.
//
// w.fsopMu must be locked for reading when calling this method.
// w.fsopMu.RUnlock() is called before returning.
func (w *LogFile) readLogsLocked(ctx context.Context, currentPos logPos, config logger.ReadConfig, watcher *logger.LogWatcher) {
	ctx, span := tracing.StartSpan(ctx, "logger.Logfile.ReadLogsLocked")
	defer span.End()
	defer close(watcher.Msg)
	currentFile, err := open(w.f.Name())
	if err != nil {
		w.fsopMu.RUnlock()
		span.SetStatus(err)
		watcher.Err <- err
		return
	}
	defer currentFile.Close()
	dec := w.createDecoder(nil)
	defer dec.Close()
	fwd := newForwarder(config)
	// At this point, w.tailFiles is responsible for unlocking w.fsopmu
	// Only the first currentPos.size bytes are tailed: entries written after
	// ReadLogs captured the position are handled by follow mode below.
	ok := w.tailFiles(ctx, config, watcher, io.NewSectionReader(currentFile, 0, currentPos.size), dec, fwd)
	if !ok {
		return
	}
	if !config.Follow {
		return
	}
	(&follow{
		LogFile:   w,
		Watcher:   watcher,
		Decoder:   dec,
		Forwarder: fwd,
	}).Do(ctx, currentFile, currentPos)
}
// fileOpener lazily produces a reader over one log file's contents.
type fileOpener interface {
	// ReaderAt returns a reader over the file's contents. The caller owns
	// the returned reader and must close it.
	ReaderAt(context.Context) (ra sizeReaderAtCloser, err error)
	// Close releases resources held by the opener itself.
	Close()
	// Ref returns a label identifying the underlying file, used in traces.
	Ref() string
}
// simpleFileOpener just holds a reference to an already open file and
// measures its size lazily on first use.
type simpleFileOpener struct {
	f      *os.File
	sz     int64
	closed bool
}

// ReaderAt returns a section reader over the whole file. The size is stat'ed
// once and cached for subsequent calls.
func (o *simpleFileOpener) ReaderAt(context.Context) (sizeReaderAtCloser, error) {
	if o.closed {
		return nil, errors.New("file is closed")
	}
	if o.sz == 0 {
		info, err := o.f.Stat()
		if err != nil {
			return nil, errors.Wrap(err, "error stating file")
		}
		o.sz = info.Size()
	}
	return &sizeReaderAtWithCloser{io.NewSectionReader(o.f, 0, o.sz), nil}, nil
}

// Ref returns the file's path for tracing.
func (o *simpleFileOpener) Ref() string {
	return o.f.Name()
}

// Close closes the underlying file and marks the opener unusable.
func (o *simpleFileOpener) Close() {
	_ = o.f.Close()
	o.closed = true
}
// decompress streams the gzip-compressed contents of src into dst, rewinding
// src first. It is the converter function used by shareTempFileConverter.
//
// On success the gzip reader's Close error is returned so trailer/checksum
// corruption is surfaced. Fix over the original: the reader is now also
// closed on the copy-error path instead of being abandoned.
func decompress(dst io.WriteSeeker, src io.ReadSeeker) error {
	if _, err := src.Seek(0, io.SeekStart); err != nil {
		return err
	}
	rc, err := gzip.NewReader(src)
	if err != nil {
		return err
	}
	if _, err := pools.Copy(dst, rc); err != nil {
		// Best-effort cleanup; the copy error is the one worth reporting.
		rc.Close()
		return err
	}
	return rc.Close()
}
// compressedFileOpener holds a reference to a compressed log file and will
// lazily open a decompressed version of the file.
type compressedFileOpener struct {
	closed bool
	f      *os.File
	lf     *LogFile
	// ifBefore: if the file's newest entry (per gzip header metadata) is
	// before this time, decompression is skipped entirely.
	ifBefore time.Time
}

// ReaderAt decompresses the file through the LogFile's shared converter and
// returns a reader over the decompressed contents. Files whose newest entry
// predates ifBefore are skipped with an empty reader.
func (cfo *compressedFileOpener) ReaderAt(ctx context.Context) (_ sizeReaderAtCloser, retErr error) {
	_, span := tracing.StartSpan(ctx, "logger.Logfile.Compressed.ReaderAt")
	defer func() {
		if retErr != nil {
			span.SetStatus(retErr)
		}
		span.End()
	}()
	span.SetAttributes(attribute.String("file", cfo.f.Name()))
	if cfo.closed {
		return nil, errors.New("compressed file closed")
	}
	gzr, err := gzip.NewReader(cfo.f)
	if err != nil {
		return nil, err
	}
	defer gzr.Close()
	// Extract the last log entry timestamp from the gzip header
	// Use this to determine if we even need to read this file based on inputs
	extra := &rotateFileMetadata{}
	err = json.Unmarshal(gzr.Header.Extra, extra)
	if err == nil && !extra.LastTime.IsZero() && extra.LastTime.Before(cfo.ifBefore) {
		span.SetAttributes(attribute.Bool("skip", true))
		return &sizeReaderAtWithCloser{}, nil
	}
	if err == nil {
		span.SetAttributes(attribute.Stringer("lastLogTime", extra.LastTime))
	}
	span.AddEvent("Start decompress")
	return cfo.lf.decompress.Do(cfo.f)
}

// Close marks the opener closed and closes the compressed file.
func (cfo *compressedFileOpener) Close() {
	cfo.closed = true
	cfo.f.Close()
}

// Ref returns the compressed file's path for tracing.
func (cfo *compressedFileOpener) Ref() string {
	return cfo.f.Name()
}
// emptyFileOpener stands in for a rotated slot whose file no longer exists
// on disk.
type emptyFileOpener struct{}

// ReaderAt returns an empty reader (zero size, EOF on any read).
func (emptyFileOpener) ReaderAt(context.Context) (sizeReaderAtCloser, error) {
	return &sizeReaderAtWithCloser{}, nil
}

// Close is a no-op; there is nothing to release.
func (emptyFileOpener) Close() {}

// Ref identifies this placeholder in traces.
func (emptyFileOpener) Ref() string { return "null" }
// openRotatedFiles returns a slice of file openers, in order from oldest to
// newest rotated slot.
//
// This method must only be called with w.fsopMu locked for reading; the
// caller remains responsible for releasing it.
func (w *LogFile) openRotatedFiles(ctx context.Context, config logger.ReadConfig) (_ []fileOpener, retErr error) {
	var out []fileOpener
	defer func() {
		// On error, release everything opened so far so nothing leaks.
		if retErr == nil {
			return
		}
		for _, fo := range out {
			fo.Close()
		}
	}()
	// Slot w.maxFiles-1 is the oldest, slot 1 the most recently rotated.
	for i := w.maxFiles - 1; i >= 1; i-- {
		fo, err := w.openRotatedFile(ctx, i, config)
		if err != nil {
			return nil, err
		}
		out = append(out, fo)
	}
	return out, nil
}
// openRotatedFile builds a fileOpener for rotation slot i: the plain rotated
// file if present, otherwise its gzipped counterpart, otherwise an empty
// placeholder (the slot may simply never have been filled).
func (w *LogFile) openRotatedFile(ctx context.Context, i int, config logger.ReadConfig) (fileOpener, error) {
	base := w.f.Name()
	f, err := open(fmt.Sprintf("%s.%d", base, i))
	switch {
	case err == nil:
		return &simpleFileOpener{f: f}, nil
	case !errors.Is(err, fs.ErrNotExist):
		return nil, errors.Wrap(err, "error opening rotated log file")
	}
	f, err = open(fmt.Sprintf("%s.%d.gz", base, i))
	switch {
	case err == nil:
		return &compressedFileOpener{f: f, lf: w, ifBefore: config.Since}, nil
	case !errors.Is(err, fs.ErrNotExist):
		return nil, errors.Wrap(err, "error opening file for decompression")
	}
	return &emptyFileOpener{}, nil
}
// sizeReaderAtCloser is used to improve type safety around tailing logs.
// Some log readers require the log file to be closed, so this makes sure all
// implementers have a closer even if it may be a no-op.
// This is opposed to asserting a type.
type sizeReaderAtCloser interface {
	SizeReaderAt
	io.Closer
}
// getTailFiles opens a reader for each file and, when nLines > 0, walks the
// files newest-first, trimming each reader to just the lines still needed to
// satisfy the tail request. The returned readers are ordered oldest-first.
// On error, every reader opened so far is closed.
func getTailFiles(ctx context.Context, files []fileOpener, nLines int, getTailReader GetTailReaderFunc) (_ []sizeReaderAtCloser, retErr error) {
	ctx, span := tracing.StartSpan(ctx, "logger.Logfile.CollectTailFiles")
	span.SetAttributes(attribute.Int("requested_lines", nLines))
	defer func() {
		if retErr != nil {
			span.SetStatus(retErr)
		}
		span.End()
	}()
	out := make([]sizeReaderAtCloser, 0, len(files))
	defer func() {
		if retErr != nil {
			for _, ra := range out {
				if err := ra.Close(); err != nil {
					log.G(ctx).WithError(err).Warn("Error closing log reader")
				}
			}
		}
	}()
	// nLines <= 0 means "everything": open all files in their given order.
	if nLines <= 0 {
		for _, fo := range files {
			span.AddEvent("Open file", attribute.String("file", fo.Ref()))
			ra, err := fo.ReaderAt(ctx)
			if err != nil {
				return nil, err
			}
			out = append(out, ra)
		}
		return out, nil
	}
	// Walk newest to oldest, stopping once enough lines have been collected.
	for i := len(files) - 1; i >= 0 && nLines > 0; i-- {
		if err := ctx.Err(); err != nil {
			return nil, errors.Wrap(err, "stopping parsing files to tail due to error")
		}
		fo := files[i]
		fileAttr := attribute.String("file", fo.Ref())
		span.AddEvent("Open file", fileAttr)
		ra, err := fo.ReaderAt(ctx)
		if err != nil {
			return nil, err
		}
		span.AddEvent("Scan file to tail", fileAttr, attribute.Int("remaining_lines", nLines))
		tail, n, err := getTailReader(ctx, ra, nLines)
		if err != nil {
			// A scan failure skips this file rather than aborting the read.
			ra.Close()
			log.G(ctx).WithError(err).Warn("Error scanning log file for tail file request, skipping")
			continue
		}
		nLines -= n
		out = append(out, &sizeReaderAtWithCloser{tail, ra.Close})
	}
	// We appended newest-first; callers expect oldest-first.
	slices.Reverse(out)
	return out, nil
}
// tailFiles forwards the tail of the given files through dec to the watcher.
// It returns false if the consumer went away, the context was cancelled, or
// forwarding stopped early; any readers not yet consumed are closed on that
// path.
func tailFiles(ctx context.Context, files []fileOpener, watcher *logger.LogWatcher, dec Decoder, getTailReader GetTailReaderFunc, nLines int, fwd *forwarder) (cont bool) {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	// Tie the context to the watcher's consumer lifetime.
	go func() {
		select {
		case <-ctx.Done():
		case <-watcher.WatchConsumerGone():
			cancel()
		}
	}()
	readers, err := getTailFiles(ctx, files, nLines, getTailReader)
	if err != nil {
		watcher.Err <- err
		return false
	}
	var idx int
	defer func() {
		// Make sure all are released if there is an early return.
		if !cont {
			for _, r := range readers[idx:] {
				if err := r.Close(); err != nil {
					log.G(ctx).WithError(err).Debug("Error closing log reader")
				}
			}
		}
	}()
	for _, ra := range readers {
		select {
		case <-watcher.WatchConsumerGone():
			return false
		case <-ctx.Done():
			return false
		default:
		}
		dec.Reset(ra)
		// If the context ends mid-file, close the reader to unblock decoding.
		cancel := context.AfterFunc(ctx, func() {
			if err := ra.Close(); err != nil {
				log.G(ctx).WithError(err).Debug("Error closing log reader")
			}
		})
		ok := fwd.Do(ctx, watcher, func() (*logger.Message, error) {
			msg, err := dec.Decode()
			if err != nil && !errors.Is(err, io.EOF) {
				// We have an error decoding the stream, but we don't want to error out
				// the whole log reader.
				// If we return anything other than EOF then the forwarder will return
				// false and we'll exit the loop.
				// Instead just log the error here and return an EOF so we can move to
				// the next file.
				log.G(ctx).WithError(err).Warn("Error decoding log file")
				return nil, io.EOF
			}
			return msg, err
		})
		cancel()
		idx++
		if !ok {
			return false
		}
	}
	return true
}
// forwarder filters decoded messages by a [since, until] time window before
// sending them to a watcher.
type forwarder struct {
	since, until time.Time
}

// newForwarder creates a forwarder from the read config's time window.
func newForwarder(config logger.ReadConfig) *forwarder {
	return &forwarder{since: config.Since, until: config.Until}
}
// Do reads log messages from dec and sends the messages matching the filter
// conditions to watcher. Do returns cont=true iff it has read all messages from
// dec without encountering a message with a timestamp which is after the
// configured until time.
func (fwd *forwarder) Do(ctx context.Context, watcher *logger.LogWatcher, next func() (*logger.Message, error)) (cont bool) {
	ctx, span := tracing.StartSpan(ctx, "logger.Logfile.Forward")
	defer func() {
		span.SetAttributes(attribute.Bool("continue", cont))
		span.End()
	}()
	for {
		// Check for cancellation before decoding the next message.
		select {
		case <-watcher.WatchConsumerGone():
			span.AddEvent("watch consumer gone")
			return false
		case <-ctx.Done():
			span.AddEvent(ctx.Err().Error())
			return false
		default:
		}
		msg, err := next()
		if err != nil {
			if errors.Is(err, io.EOF) {
				span.AddEvent("EOF")
				return true
			}
			span.SetStatus(err)
			log.G(ctx).WithError(err).Debug("Error while decoding log entry, not continuing")
			return false
		}
		if !fwd.since.IsZero() {
			if msg.Timestamp.Before(fwd.since) {
				continue
			}
			// We've found our first message with a timestamp >= since. As message
			// timestamps might not be monotonic, we need to skip the since check for all
			// subsequent messages so we do not filter out later messages which happen to
			// have timestamps before since.
			fwd.since = time.Time{}
		}
		if !fwd.until.IsZero() && msg.Timestamp.After(fwd.until) {
			log.G(ctx).Debug("Log is newer than requested window, skipping remaining logs")
			return false
		}
		// Check cancellation again while blocking on the (unbuffered) send.
		select {
		case <-ctx.Done():
			span.AddEvent(ctx.Err().Error())
			return false
		case <-watcher.WatchConsumerGone():
			span.AddEvent("watch consumer gone")
			return false
		case watcher.Msg <- msg:
		}
	}
}
package loggerutils
import (
"context"
"sync"
"github.com/docker/docker/daemon/logger"
"github.com/pkg/errors"
)
// MessageQueue is a queue for log messages.
//
// [MessageQueue.Enqueue] will block when the queue is full.
// To dequeue messages call [MessageQueue.Receiver] and pull messages off the
// returned channel.
//
// Closing only prevents new messages from being added to the queue.
// The queue can still be drained after close.
//
// The zero value of MessageQueue is safe to use, but does not do any internal
// buffering (queue size is 0).
type MessageQueue struct {
	// maxSize is the channel buffer size; 0 for the zero value.
	maxSize int
	mu      sync.Mutex
	// closing is set by the first Close call; later calls wait on closeWait.
	closing bool
	// closed is closed to signal senders that the queue is shutting down.
	closed chan struct{}
	// Blocks multiple calls to [MessageQueue.Close] until the queue is actually closed
	closeWait chan struct{}
	// We need to be able to safely close the send channel so that [MessageQueue.Dequeue]
	// can drain the queue without blocking.
	// This cond var helps deal with that.
	cond *sync.Cond
	// sendWaiters counts Enqueue calls currently in flight.
	sendWaiters int
	ch          chan *logger.Message
}
// NewMessageQueue creates a new queue with the specified size.
func NewMessageQueue(maxSize int) *MessageQueue {
	q := &MessageQueue{maxSize: maxSize}
	q.init()
	return q
}
// init lazily allocates the queue's internals so the zero value of
// MessageQueue is usable. Each field is created only if still nil, so the
// method is idempotent. All callers except the constructor hold q.mu.
func (q *MessageQueue) init() {
	if q.cond == nil {
		q.cond = sync.NewCond(&q.mu)
	}
	if q.ch == nil {
		q.ch = make(chan *logger.Message, q.maxSize)
	}
	if q.closed == nil {
		q.closed = make(chan struct{})
	}
	if q.closeWait == nil {
		q.closeWait = make(chan struct{})
	}
}
// ErrQueueClosed is returned by [MessageQueue.Enqueue] when the queue has
// been closed.
var ErrQueueClosed = errors.New("queue is closed")
// Enqueue adds the provided message to the queue.
// Enqueue blocks if the queue is full.
//
// The two possible error cases are:
//  1. The provided context is cancelled
//  2. [ErrQueueClosed] when the queue has been closed.
func (q *MessageQueue) Enqueue(ctx context.Context, m *logger.Message) error {
	q.mu.Lock()
	q.init()
	// Increment the waiter count
	// This prevents the send channel from being closed while we are trying to send.
	q.sendWaiters++
	q.mu.Unlock()
	defer func() {
		q.mu.Lock()
		// Decrement the waiter count and signal to any potential closer to check
		// the wait count again.
		// Only bother signaling if this is the last waiter.
		q.sendWaiters--
		if q.sendWaiters == 0 {
			q.cond.Signal()
		}
		q.mu.Unlock()
	}()
	// Before trying to send on the channel, check if we are closed. This
	// non-blocking select ensures closure/cancellation wins over a send that
	// would also be ready in the select below.
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-q.closed:
		return ErrQueueClosed
	default:
	}
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-q.closed:
		return ErrQueueClosed
	case q.ch <- m:
		return nil
	}
}
// Close prevents any new messages from being added to the queue.
// It blocks until in-flight Enqueue calls have finished; concurrent Close
// calls block until the first one completes.
func (q *MessageQueue) Close() {
	q.mu.Lock()
	q.init()
	if q.closing {
		// unlock the mutex here so that the goroutine waiting on the cond var can
		// take the lock when signaled.
		q.mu.Unlock()
		<-q.closeWait
		return
	}
	defer q.mu.Unlock()
	// Prevent multiple Close calls from trying to close things.
	q.closing = true
	close(q.closed)
	// Wait for any senders to finish
	// Because we closed the channel above, this shouldn't block for a long period.
	for q.sendWaiters > 0 {
		q.cond.Wait()
	}
	close(q.ch)
	close(q.closeWait)
}
// Receiver returns a channel that can be used to dequeue messages
// The channel will be closed when the message queue is closed but may have
// messages buffered.
func (q *MessageQueue) Receiver() <-chan *logger.Message {
	q.mu.Lock()
	q.init()
	ch := q.ch
	q.mu.Unlock()
	return ch
}
package loggerutils
import (
"io"
"io/fs"
"os"
"runtime"
)
// fileConvertFn converts the contents of src and writes the result to dst.
type fileConvertFn func(dst io.WriteSeeker, src io.ReadSeeker) error

// stfID identifies one converted file in sharedTempFileConverter state.
type stfID uint64
// sharedTempFileConverter converts files using a user-supplied function and
// writes the results to temporary files which are automatically cleaned up on
// close. If another request is made to convert the same file, the conversion
// result and temporary file are reused if they have not yet been cleaned up.
//
// A file is considered the same as another file using the os.SameFile function,
// which compares file identity (e.g. device and inode numbers on Linux) and is
// robust to file renames. Input files are assumed to be immutable; no attempt
// is made to ascertain whether the file contents have changed between requests.
//
// One file descriptor is used per source file, irrespective of the number of
// concurrent readers of the converted contents.
type sharedTempFileConverter struct {
	// The directory where temporary converted files are to be written to.
	// If set to the empty string, the default directory for temporary files
	// is used.
	TempDir string
	conv    fileConvertFn
	// st is a single-slot channel acting as a mutex around the converter's
	// shared state: receive to acquire, send to release.
	st chan stfcState
}
// stfcState is the shared state of a sharedTempFileConverter, passed around
// through the converter's single-slot st channel.
type stfcState struct {
	fl     map[stfID]sharedTempFile // In-flight and completed conversions.
	nextID stfID                    // Next ID to assign to a new conversion.
}

// sharedTempFile tracks one converted file and its readers.
type sharedTempFile struct {
	src  os.FileInfo // Info about the source file for path-independent identification with os.SameFile.
	fd   *os.File    // Temporary converted file; nil while conversion is in progress.
	size int64       // Size of the converted contents.
	ref  int         // Reference count of open readers on the temporary file.
	wait []chan<- stfConvertResult // Wait list for the conversion to complete.
}

// stfConvertResult is delivered to waiters when a conversion finishes.
type stfConvertResult struct {
	fr  *sharedFileReader
	err error
}
// newSharedTempFileConverter builds a converter around conv and primes its
// single-slot state channel with an empty file table.
func newSharedTempFileConverter(conv fileConvertFn) *sharedTempFileConverter {
	c := &sharedTempFileConverter{
		conv: conv,
		st:   make(chan stfcState, 1),
	}
	c.st <- stfcState{fl: make(map[stfID]sharedTempFile)}
	return c
}
// Do returns a reader for the contents of f as converted by the c.C function.
// It is the caller's responsibility to close the returned reader.
//
// This function is safe for concurrent use by multiple goroutines.
func (c *sharedTempFileConverter) Do(f *os.File) (*sharedFileReader, error) {
	stat, err := f.Stat()
	if err != nil {
		return nil, err
	}
	// Acquire the shared state; openExisting/openNew are responsible for
	// sending it back.
	st := <-c.st
	for id, tf := range st.fl {
		// os.SameFile can have false positives if one of the files was
		// deleted before the other file was created -- such as during
		// log rotations... https://github.com/golang/go/issues/36895
		// Weed out those false positives by also comparing the files'
		// ModTime, which conveniently also handles the case of true
		// positives where the file has also been modified since it was
		// first converted.
		if os.SameFile(tf.src, stat) && tf.src.ModTime().Equal(stat.ModTime()) {
			return c.openExisting(st, id, tf)
		}
	}
	return c.openNew(st, f, stat)
}
// openNew converts f for the first time. It registers a placeholder entry in
// the state (so concurrent requests can join as waiters), runs the conversion
// with the state released, then publishes the result to all waiters.
func (c *sharedTempFileConverter) openNew(st stfcState, f *os.File, stat os.FileInfo) (*sharedFileReader, error) {
	// Record that we are starting to convert this file so that any other
	// requests for the same source file while the conversion is in progress
	// can join.
	id := st.nextID
	st.nextID++
	st.fl[id] = sharedTempFile{src: stat}
	c.st <- st
	// Conversion runs without holding the state, so other files can be
	// converted concurrently.
	dst, size, convErr := c.convert(f)
	st = <-c.st
	flid := st.fl[id]
	if convErr != nil {
		// Conversion failed. Delete it from the state so that future
		// requests to convert the same file can try again fresh.
		delete(st.fl, id)
		c.st <- st
		for _, w := range flid.wait {
			w <- stfConvertResult{err: convErr}
		}
		return nil, convErr
	}
	flid.fd = dst
	flid.size = size
	// One reference per waiter, plus one for this caller.
	flid.ref = len(flid.wait) + 1
	for _, w := range flid.wait {
		// Each waiter needs its own reader with an independent read pointer.
		w <- stfConvertResult{fr: flid.Reader(c, id)}
	}
	flid.wait = nil
	st.fl[id] = flid
	c.st <- st
	return flid.Reader(c, id), nil
}
// openExisting returns a reader for a file that is already converted, or —
// if the conversion is still in flight — joins its wait list and blocks until
// the converting goroutine delivers a result.
func (c *sharedTempFileConverter) openExisting(st stfcState, id stfID, v sharedTempFile) (*sharedFileReader, error) {
	if v.fd != nil {
		// Already converted.
		v.ref++
		st.fl[id] = v
		c.st <- st
		return v.Reader(c, id), nil
	}
	// The file has not finished being converted.
	// Add ourselves to the wait list. "Don't call us; we'll call you."
	wait := make(chan stfConvertResult, 1)
	v.wait = append(v.wait, wait)
	st.fl[id] = v
	c.st <- st
	res := <-wait
	return res.fr, res.err
}
// convert runs c.conv from f into a fresh temporary file and returns the
// result reopened read-only, along with its size. The temporary file is
// removed from the filesystem immediately; the OS reclaims the storage once
// the last descriptor is closed.
func (c *sharedTempFileConverter) convert(f *os.File) (converted *os.File, size int64, _ error) {
	dst, err := os.CreateTemp(c.TempDir, "dockerdtemp.*")
	if err != nil {
		return nil, 0, err
	}
	defer func() {
		_ = dst.Close()
		// Delete the temporary file immediately so that final cleanup
		// of the file on disk is deferred to the OS once we close all
		// our file descriptors (or the process dies). Assuming no early
		// returns due to errors, the file will be open by this process
		// with a read-only descriptor at this point. As we don't care
		// about being able to reuse the file name -- it's randomly
		// generated and unique -- we can safely use os.Remove on
		// Windows.
		_ = os.Remove(dst.Name())
	}()
	err = c.conv(dst, f)
	if err != nil {
		return nil, 0, err
	}
	// Close the exclusive read-write file descriptor, catching any delayed
	// write errors (and on Windows, releasing the share-locks on the file)
	if err := dst.Close(); err != nil {
		_ = os.Remove(dst.Name())
		return nil, 0, err
	}
	// Open the file again read-only (without locking the file against
	// deletion on Windows).
	converted, err = open(dst.Name())
	if err != nil {
		return nil, 0, err
	}
	// The position of the file's read pointer doesn't matter as all readers
	// will be accessing the file through its io.ReaderAt interface.
	size, err = converted.Seek(0, io.SeekEnd)
	if err != nil {
		_ = converted.Close()
		return nil, 0, err
	}
	return converted, size, nil
}
// sharedFileReader is one reference-counted view onto a converted temp file.
type sharedFileReader struct {
	*io.SectionReader
	c      *sharedTempFileConverter
	id     stfID
	closed bool // Guards against double-Close; set by Close.
}

// Reader mints a new reader (with its own read pointer) over the converted
// file. A finalizer is set so a leaked reader still releases its reference.
func (stf sharedTempFile) Reader(c *sharedTempFileConverter, id stfID) *sharedFileReader {
	rdr := &sharedFileReader{SectionReader: io.NewSectionReader(stf.fd, 0, stf.size), c: c, id: id}
	runtime.SetFinalizer(rdr, (*sharedFileReader).Close)
	return rdr
}
// Close releases this reader's reference on the shared temporary file,
// closing the file when the last reference is dropped. Closing an
// already-closed reader returns fs.ErrClosed.
//
// Fix over the original: the finalizer registered in Reader is now cleared on
// every successful Close path. Previously it was only cleared on the
// non-last-reference path, so the final reader kept its finalizer registered,
// causing the GC to needlessly run Close again (and delaying collection of
// the reader by one GC cycle).
func (r *sharedFileReader) Close() error {
	if r.closed {
		return fs.ErrClosed
	}
	st := <-r.c.st
	flid, ok := st.fl[r.id]
	if !ok {
		panic("invariant violation: temp file state missing from map")
	}
	flid.ref--
	lastRef := flid.ref <= 0
	if lastRef {
		// Nobody else is reading; drop the entry so the ID can't be reused.
		delete(st.fl, r.id)
	} else {
		st.fl[r.id] = flid
	}
	r.closed = true
	r.c.st <- st
	// This Close ran explicitly; the finalizer is no longer needed.
	runtime.SetFinalizer(r, nil)
	if lastRef {
		return flid.fd.Close()
	}
	return nil
}
package logger
import (
"fmt"
"os"
"regexp"
"strings"
"time"
)
// Info provides enough information for a logging driver to do its function.
type Info struct {
	// Config holds the log-driver options (e.g. "labels", "env",
	// "labels-regex", "env-regex") consumed by ExtraAttributes.
	Config              map[string]string
	ContainerID         string
	ContainerName       string
	ContainerEntrypoint string
	ContainerArgs       []string
	ContainerImageID    string
	ContainerImageName  string
	ContainerCreated    time.Time
	// ContainerEnv holds "KEY=VALUE" entries, split by ExtraAttributes.
	ContainerEnv    []string
	ContainerLabels map[string]string
	LogPath         string
	DaemonName      string
}
// ExtraAttributes returns the user-defined extra attributes (labels,
// environment variables) in key-value format. This can be used by log drivers
// that support metadata to add more context to a log.
//
// Keys are selected by the "labels", "labels-regex", "env" and "env-regex"
// entries of info.Config. When keyMod is non-nil it is applied to each
// selected key before storing. An invalid regular expression yields an error.
func (info *Info) ExtraAttributes(keyMod func(string) string) (map[string]string, error) {
	extra := make(map[string]string)

	// addKeys copies the values for the comma-separated list of keys in csv
	// from src into extra, applying keyMod to each stored key.
	addKeys := func(csv string, src map[string]string) {
		for _, k := range strings.Split(csv, ",") {
			v, ok := src[k]
			if !ok {
				continue
			}
			if keyMod != nil {
				k = keyMod(k)
			}
			extra[k] = v
		}
	}
	// addMatching copies every entry of src whose key matches pattern into
	// extra, applying keyMod to each stored key.
	addMatching := func(pattern string, src map[string]string) error {
		re, err := regexp.Compile(pattern)
		if err != nil {
			return err
		}
		for k, v := range src {
			if !re.MatchString(k) {
				continue
			}
			if keyMod != nil {
				k = keyMod(k)
			}
			extra[k] = v
		}
		return nil
	}

	if labels, ok := info.Config["labels"]; ok && labels != "" {
		addKeys(labels, info.ContainerLabels)
	}
	if labelsRegex, ok := info.Config["labels-regex"]; ok && labelsRegex != "" {
		if err := addMatching(labelsRegex, info.ContainerLabels); err != nil {
			return nil, err
		}
	}

	envMapping := make(map[string]string)
	for _, e := range info.ContainerEnv {
		if k, v, ok := strings.Cut(e, "="); ok {
			envMapping[k] = v
		}
	}
	// Code below is only to handle adding attributes based on env-vars.
	if len(envMapping) == 0 {
		return extra, nil
	}
	if env, ok := info.Config["env"]; ok && env != "" {
		addKeys(env, envMapping)
	}
	if envRegex, ok := info.Config["env-regex"]; ok && envRegex != "" {
		if err := addMatching(envRegex, envMapping); err != nil {
			return nil, err
		}
	}
	return extra, nil
}
// Hostname returns the hostname from the underlying OS.
func (info *Info) Hostname() (string, error) {
	name, err := os.Hostname()
	if err != nil {
		return "", fmt.Errorf("logger: can not resolve hostname: %v", err)
	}
	return name, nil
}
// Command returns the command that the container being logged was
// started with: the entrypoint followed by the container arguments,
// joined with spaces.
func (info *Info) Command() string {
	parts := append([]string{info.ContainerEntrypoint}, info.ContainerArgs...)
	return strings.Join(parts, " ")
}
// ID returns the Container ID shortened to 12 characters, or the full ID
// unchanged when it is shorter than that.
//
// Fix over the original: the unconditional ContainerID[:12] slice panicked on
// IDs shorter than 12 characters (e.g. an empty ID).
func (info *Info) ID() string {
	if len(info.ContainerID) < 12 {
		return info.ContainerID
	}
	return info.ContainerID[:12]
}
// FullID is an alias of ContainerID.
func (info *Info) FullID() string {
	return info.ContainerID
}
// Name returns the ContainerName without a preceding '/'.
func (info *Info) Name() string {
	return strings.TrimPrefix(info.ContainerName, "/")
}
// ImageID returns the ContainerImageID shortened to 12 characters, or the
// full ID unchanged when it is shorter than that.
//
// Fix over the original: the unconditional ContainerImageID[:12] slice
// panicked on IDs shorter than 12 characters (e.g. an empty ID).
func (info *Info) ImageID() string {
	if len(info.ContainerImageID) < 12 {
		return info.ContainerImageID
	}
	return info.ContainerImageID[:12]
}
// ImageFullID is an alias of ContainerImageID.
func (info *Info) ImageFullID() string {
	return info.ContainerImageID
}

// ImageName is an alias of ContainerImageName.
func (info *Info) ImageName() string {
	return info.ContainerImageName
}
package logger
import (
gometrics "github.com/docker/go-metrics"
)
var (
	// logWritesFailedCount counts log write operations that failed.
	logWritesFailedCount gometrics.Counter
	// logReadsFailedCount counts failed log reads from container stdio.
	logReadsFailedCount gometrics.Counter
	// totalPartialLogs counts log entries larger than the log buffer.
	totalPartialLogs gometrics.Counter
)

// init registers the "logger" metrics namespace and its counters with
// go-metrics at package load time.
func init() {
	loggerMetrics := gometrics.NewNamespace("logger", "", nil)
	logWritesFailedCount = loggerMetrics.NewCounter("log_write_operations_failed", "Number of log write operations that failed")
	logReadsFailedCount = loggerMetrics.NewCounter("log_read_operations_failed", "Number of log reads from container stdio that failed")
	totalPartialLogs = loggerMetrics.NewCounter("log_entries_size_greater_than_buffer", "Number of log entries which are larger than the log buffer")
	gometrics.Register(loggerMetrics)
}
package logger
import (
"fmt"
"io"
"os"
"path/filepath"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/pkg/plugingetter"
"github.com/docker/docker/pkg/plugins"
"github.com/docker/docker/pkg/stringid"
"github.com/moby/moby/api/types/plugins/logdriver"
"github.com/pkg/errors"
)
// pluginGetter resolves logging-driver plugins by name; it is installed via
// RegisterPluginGetter.
var pluginGetter plugingetter.PluginGetter

// extName is the plugin extension name implemented by logging-driver plugins.
const extName = "LogDriver"
// logPlugin defines the available functions that logging plugins must implement.
type logPlugin interface {
	// StartLogging tells the plugin to begin consuming logs from streamPath.
	StartLogging(streamPath string, info Info) (err error)
	// StopLogging tells the plugin to stop consuming logs from streamPath.
	StopLogging(streamPath string) (err error)
	// Capabilities reports optional features the plugin supports.
	Capabilities() (capability Capability, err error)
	// ReadLogs streams logs back from the plugin, when supported.
	ReadLogs(info Info, config ReadConfig) (stream io.ReadCloser, err error)
}
// RegisterPluginGetter sets the plugingetter used to resolve logging-driver
// plugins by name.
func RegisterPluginGetter(g plugingetter.PluginGetter) {
	pluginGetter = g
}
// getPlugin returns a logging driver by its name.
// If the driver is empty, it looks for the local driver.
func getPlugin(name string, mode int) (Creator, error) {
	p, err := pluginGetter.Get(name, extName, mode)
	if err != nil {
		return nil, fmt.Errorf("error looking up logging plugin %s: %v", name, err)
	}
	c, err := makePluginClient(p)
	if err != nil {
		return nil, err
	}
	return makePluginCreator(name, c, p.ScopedPath), nil
}
// makePluginClient wraps a resolved plugin in a logPlugin proxy, either over
// the plugin's built-in v1 client or over a fresh HTTP-v1 client dialed at
// the plugin's address.
func makePluginClient(p plugingetter.CompatPlugin) (logPlugin, error) {
	switch pp := p.(type) {
	case plugingetter.PluginWithV1Client:
		return &logPluginProxy{pp.Client()}, nil
	case plugingetter.PluginAddr:
		if pp.Protocol() != plugins.ProtocolSchemeHTTPV1 {
			return nil, errors.Errorf("plugin protocol not supported: %s", p)
		}
		addr := pp.Addr()
		c, err := plugins.NewClientWithTimeout(addr.Network()+"://"+addr.String(), nil, pp.Timeout())
		if err != nil {
			return nil, errors.Wrap(err, "error making plugin client")
		}
		return &logPluginProxy{c}, nil
	default:
		return nil, errdefs.System(errors.Errorf("got unknown plugin type %T", p))
	}
}
// makePluginCreator wraps the plugin client in a Creator. The returned
// Creator provisions a per-container fifo under the (scoped) plugin log
// root, tells the plugin to start reading from it, and returns a
// pluginAdapter that encodes log entries into the fifo.
func makePluginCreator(name string, l logPlugin, scopePath func(s string) string) Creator {
	return func(logCtx Info) (logger Logger, retErr error) {
		defer func() {
			if retErr != nil {
				// Release the plugin refcount acquired by getPlugin.
				_, _ = pluginGetter.Get(name, extName, plugingetter.Release)
			}
		}()
		unscopedPath := filepath.Join("/", "run", "docker", "logging")
		logRoot := scopePath(unscopedPath)
		if err := os.MkdirAll(logRoot, 0o700); err != nil {
			return nil, err
		}
		id := stringid.GenerateRandomID()
		a := &pluginAdapter{
			driverName: name,
			id:         id,
			plugin:     l,
			fifoPath:   filepath.Join(logRoot, id),
			logInfo:    logCtx,
		}
		// Capabilities are optional: on error the zero Capability value
		// (no log reading) is assumed and the error is ignored.
		caps, err := a.plugin.Capabilities()
		if err == nil {
			a.capabilities = caps
		}
		stream, err := openPluginStream(a)
		if err != nil {
			return nil, err
		}
		a.stream = stream
		a.enc = logdriver.NewLogEntryEncoder(a.stream)
		// NOTE(review): the plugin is handed the unscoped path while the fifo
		// was created under the scoped root — presumably the plugin resolves
		// it inside its own mount namespace; confirm before changing.
		if err := l.StartLogging(filepath.Join(unscopedPath, id), logCtx); err != nil {
			return nil, errors.Wrapf(err, "error creating logger")
		}
		if caps.ReadLogs {
			return &pluginAdapterWithRead{a}, nil
		}
		return a, nil
	}
}
//go:build linux || freebsd
package logger
import (
"context"
"io"
"github.com/containerd/fifo"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
// openPluginStream opens (creating if needed) the adapter's fifo and returns
// it as the write side used to feed log entries to the plugin.
func openPluginStream(a *pluginAdapter) (io.WriteCloser, error) {
	// Make sure to also open with read (in addition to write) to avoid broken pipe errors on plugin failure.
	// It is up to the plugin to keep track of pipes that it should re-attach to, however.
	// If the plugin doesn't open for reads, then the container will block once the pipe is full.
	f, err := fifo.OpenFifo(context.Background(), a.fifoPath, unix.O_RDWR|unix.O_CREAT|unix.O_NONBLOCK, 0o700)
	if err != nil {
		return nil, errors.Wrapf(err, "error creating i/o pipe for log plugin: %s", a.Name())
	}
	return f, nil
}
package logger
import (
"errors"
"io"
)
// client is the subset of the plugin client API used by logPluginProxy.
type client interface {
	Call(string, interface{}, interface{}) error
	Stream(string, interface{}) (io.ReadCloser, error)
}

// logPluginProxy translates logPlugin method calls into plugin RPCs.
type logPluginProxy struct {
	client
}

// logPluginProxyStartLoggingRequest is the wire request for LogDriver.StartLogging.
type logPluginProxyStartLoggingRequest struct {
	File string
	Info Info
}

// logPluginProxyStartLoggingResponse is the wire response; Err is non-empty on failure.
type logPluginProxyStartLoggingResponse struct {
	Err string
}
// StartLogging asks the plugin to begin reading log entries from file for
// the container described by info. A non-empty Err field in the plugin's
// response is surfaced as an error.
func (pp *logPluginProxy) StartLogging(file string, info Info) error {
	req := logPluginProxyStartLoggingRequest{File: file, Info: info}
	var ret logPluginProxyStartLoggingResponse
	if err := pp.Call("LogDriver.StartLogging", req, &ret); err != nil {
		return err
	}
	if ret.Err != "" {
		return errors.New(ret.Err)
	}
	return nil
}
// logPluginProxyStopLoggingRequest is the wire request for LogDriver.StopLogging.
type logPluginProxyStopLoggingRequest struct {
	File string
}

// logPluginProxyStopLoggingResponse is the wire response; Err is non-empty on failure.
type logPluginProxyStopLoggingResponse struct {
	Err string
}
// StopLogging asks the plugin to stop reading container logs from file.
// A non-empty Err field in the plugin's response is surfaced as an error.
func (pp *logPluginProxy) StopLogging(file string) error {
	req := logPluginProxyStopLoggingRequest{File: file}
	var ret logPluginProxyStopLoggingResponse
	if err := pp.Call("LogDriver.StopLogging", req, &ret); err != nil {
		return err
	}
	if ret.Err != "" {
		return errors.New(ret.Err)
	}
	return nil
}
// logPluginProxyCapabilitiesResponse is the wire response for
// LogDriver.Capabilities; Err is non-empty on failure.
type logPluginProxyCapabilitiesResponse struct {
	Cap Capability
	Err string
}
// Capabilities queries the plugin for its optional features. A non-empty
// Err field in the response is returned as an error, with a zero Capability.
func (pp *logPluginProxy) Capabilities() (Capability, error) {
	var resp logPluginProxyCapabilitiesResponse
	err := pp.Call("LogDriver.Capabilities", nil, &resp)
	switch {
	case err != nil:
		return Capability{}, err
	case resp.Err != "":
		return Capability{}, errors.New(resp.Err)
	}
	return resp.Cap, nil
}
// logPluginProxyReadLogsRequest is the wire request for LogDriver.ReadLogs.
type logPluginProxyReadLogsRequest struct {
	Info   Info
	Config ReadConfig
}

// ReadLogs opens a stream of log messages from the plugin for the container
// described by info; the caller is responsible for closing the stream.
func (pp *logPluginProxy) ReadLogs(info Info, config ReadConfig) (stream io.ReadCloser, _ error) {
	return pp.Stream("LogDriver.ReadLogs", logPluginProxyReadLogsRequest{
		Info:   info,
		Config: config,
	})
}
package logger
import (
"context"
"errors"
"sync"
"sync/atomic"
)
const (
	// defaultRingMaxSize is used when a negative max size is passed to
	// NewRingLogger.
	defaultRingMaxSize = 1e6 // 1MB
)

// ringLogger is a ring buffer that implements the Logger interface.
// This is used when lossy logging is OK.
type ringLogger struct {
	buffer    *messageRing // pending messages awaiting delivery to l
	l         Logger       // the wrapped driver that actually writes logs
	logInfo   Info
	closeFlag atomic.Bool
	wg        sync.WaitGroup // tracks the run() goroutine
}

// Compile-time interface conformance checks.
var (
	_ SizedLogger = (*ringLogger)(nil)
	_ LogReader   = (*ringWithReader)(nil)
)

// ringWithReader is a ringLogger whose wrapped driver also supports reading
// logs back (LogReader).
type ringWithReader struct {
	*ringLogger
}
// ReadLogs delegates log reading to the wrapped driver.
func (r *ringWithReader) ReadLogs(ctx context.Context, cfg ReadConfig) *LogWatcher {
	reader, ok := r.l.(LogReader)
	if !ok {
		// something is wrong if we get here: NewRingLogger only constructs a
		// ringWithReader when the driver implements LogReader.
		panic("expected log reader")
	}
	return reader.ReadLogs(ctx, cfg)
}
// newRingLogger constructs a ringLogger around driver and starts the
// background goroutine that drains the ring into it.
func newRingLogger(driver Logger, logInfo Info, maxSize int64) *ringLogger {
	rl := &ringLogger{
		buffer:  newRing(maxSize),
		l:       driver,
		logInfo: logInfo,
	}
	rl.wg.Add(1)
	go rl.run()
	return rl
}
// NewRingLogger creates a new Logger that is implemented as a RingBuffer wrapping
// the passed in logger. A negative maxSize selects the default ring size.
func NewRingLogger(driver Logger, logInfo Info, maxSize int64) Logger {
	if maxSize < 0 {
		maxSize = defaultRingMaxSize
	}
	rl := newRingLogger(driver, logInfo, maxSize)
	// Preserve the driver's read capability on the wrapper, if present.
	if _, canRead := driver.(LogReader); canRead {
		return &ringWithReader{rl}
	}
	return rl
}
// BufSize returns the buffer size of the underlying logger.
// Returns -1 if the logger doesn't match SizedLogger interface.
func (r *ringLogger) BufSize() int {
	if sl, ok := r.l.(SizedLogger); ok {
		return sl.BufSize()
	}
	return -1
}

// Log queues messages into the ring buffer
func (r *ringLogger) Log(msg *Message) error {
	if r.closed() {
		return errClosed
	}
	// Enqueue may silently drop msg when the ring is full (lossy by design).
	return r.buffer.Enqueue(msg)
}

// Name returns the name of the underlying logger
func (r *ringLogger) Name() string {
	return r.l.Name()
}

// closed reports whether Close has been called.
func (r *ringLogger) closed() bool {
	return r.closeFlag.Load()
}

// setClosed marks the logger as closed.
func (r *ringLogger) setClosed() {
	r.closeFlag.Store(true)
}
// Close closes the logger
func (r *ringLogger) Close() error {
	r.setClosed()
	r.buffer.Close()
	// Wait for run() to exit before draining so it cannot race with Drain
	// for the remaining queued messages.
	r.wg.Wait()
	// empty out the queue
	var logErr bool
	for _, msg := range r.buffer.Drain() {
		if logErr {
			// some error logging a previous message, so re-insert to message pool
			// and assume log driver is hosed
			PutMessage(msg)
			continue
		}
		if err := r.l.Log(msg); err != nil {
			logDriverError(r.l.Name(), string(msg.Line), err)
			logErr = true
		}
	}
	return r.l.Close()
}
// run consumes messages from the ring buffer and forwards them to the underling
// logger.
// This is run in a goroutine when the ringLogger is created
func (r *ringLogger) run() {
	defer r.wg.Done()
	for {
		if r.closed() {
			return
		}
		msg, err := r.buffer.Dequeue()
		if err != nil {
			// buffer is closed
			return
		}
		// A delivery error drops just this message; the loop keeps running
		// so one bad write does not stall logging.
		if err := r.l.Log(msg); err != nil {
			logDriverError(r.l.Name(), string(msg.Line), err)
		}
	}
}
// messageRing is a mutex-guarded FIFO of log messages bounded by total
// byte size rather than element count.
type messageRing struct {
	mu sync.Mutex
	// signals callers of `Dequeue` to wake up either on `Close` or when a new `Message` is added
	wait *sync.Cond

	sizeBytes int64 // current buffer size
	maxBytes  int64 // max buffer size

	queue  []*Message
	closed bool
}
// newRing returns an empty messageRing capped at maxBytes total message
// bytes. The initial queue capacity is sized down when the byte budget can
// only ever hold a single message.
func newRing(maxBytes int64) *messageRing {
	capacity := 1000
	switch maxBytes {
	case 0, 1:
		// With 0 or 1 max byte size, the maximum size of the queue would
		// only ever be 1 message long.
		capacity = 1
	}
	ring := &messageRing{queue: make([]*Message, 0, capacity), maxBytes: maxBytes}
	ring.wait = sync.NewCond(&ring.mu)
	return ring
}
// Enqueue adds a message to the buffer queue
// If the message is too big for the buffer it drops the new message.
// If there are no messages in the queue and the message is still too big, it adds the message anyway.
func (r *messageRing) Enqueue(m *Message) error {
	mSize := int64(len(m.Line))
	r.mu.Lock()
	if r.closed {
		r.mu.Unlock()
		return errClosed
	}
	// Over budget with a non-empty queue: drop m (lossy by design), but
	// still signal so a waiting Dequeue can drain the backlog.
	if mSize+r.sizeBytes > r.maxBytes && len(r.queue) > 0 {
		r.wait.Signal()
		r.mu.Unlock()
		return nil
	}
	r.queue = append(r.queue, m)
	r.sizeBytes += mSize
	r.wait.Signal()
	r.mu.Unlock()
	return nil
}
// Dequeue pulls a message off the queue
// If there are no messages, it waits for one.
// If the buffer is closed, it will return immediately.
func (r *messageRing) Dequeue() (*Message, error) {
	r.mu.Lock()
	for len(r.queue) == 0 && !r.closed {
		r.wait.Wait()
	}
	// Close wins over queued messages; whatever remains in the queue is
	// recovered by the caller via Drain.
	if r.closed {
		r.mu.Unlock()
		return nil, errClosed
	}
	msg := r.queue[0]
	r.queue = r.queue[1:]
	r.sizeBytes -= int64(len(msg.Line))
	r.mu.Unlock()
	return msg, nil
}
// errClosed is returned by Enqueue/Dequeue once the ring has been closed.
var errClosed = errors.New("closed")

// Close closes the buffer ensuring no new messages can be added.
// Any callers waiting to dequeue a message will be woken up.
func (r *messageRing) Close() {
	r.mu.Lock()
	if r.closed {
		// Already closed; idempotent.
		r.mu.Unlock()
		return
	}
	r.closed = true
	// Wake every blocked Dequeue so it can observe the closed flag.
	r.wait.Broadcast()
	r.mu.Unlock()
}
// Drain drains all messages from the queue.
// This can be used after `Close()` to get any remaining messages that were in queue.
func (r *messageRing) Drain() []*Message {
	r.mu.Lock()
	defer r.mu.Unlock()
	drained := append(make([]*Message, 0, len(r.queue)), r.queue...)
	r.queue = r.queue[:0]
	r.sizeBytes = 0
	return drained
}
package templates
import (
"bytes"
"encoding/json"
"strings"
"text/template"
)
// basicFunctions are the set of initial
// functions provided to every template.
var basicFunctions = template.FuncMap{
	// json renders the value as compact JSON without HTML escaping.
	"json": func(v interface{}) string {
		buf := &bytes.Buffer{}
		enc := json.NewEncoder(buf)
		enc.SetEscapeHTML(false)
		_ = enc.Encode(v) //nolint:errchkjson // ignore "Error return json.Encoder.Encode` is not checked"
		// Remove the trailing new line added by the encoder
		return strings.TrimSpace(buf.String())
	},
	"split":    strings.Split,
	"join":     strings.Join,
	"title":    strings.Title, //nolint:staticcheck // SA1019: strings.Title is deprecated: The rule Title uses for word boundaries does not handle Unicode punctuation properly. Use golang.org/x/text/cases instead.
	"lower":    strings.ToLower,
	"upper":    strings.ToUpper,
	"pad":      padWithSpace,
	"truncate": truncateWithLength,
}
// NewParse creates a new tagged template with the basic functions
// and parses the given format.
func NewParse(tag, format string) (*template.Template, error) {
	tmpl := template.New(tag).Funcs(basicFunctions)
	return tmpl.Parse(format)
}
// padWithSpace surrounds a non-empty string with the requested number of
// leading and trailing spaces; empty input is returned unchanged.
func padWithSpace(source string, prefix, suffix int) string {
	if len(source) == 0 {
		return source
	}
	padded := strings.Repeat(" ", prefix)
	padded += source
	padded += strings.Repeat(" ", suffix)
	return padded
}
// truncateWithLength truncates the source string up to the length provided by the input.
// A negative length yields the empty string; the previous implementation
// panicked in that case because source[:length] was evaluated with a
// negative index.
func truncateWithLength(source string, length int) string {
	if length < 0 {
		// Guard: slicing with a negative bound panics.
		return ""
	}
	if len(source) < length {
		return source
	}
	return source[:length]
}
package network
import (
"github.com/docker/docker/errdefs"
"github.com/moby/moby/api/types/filters"
"github.com/moby/moby/api/types/network"
"github.com/pkg/errors"
)
// FilterNetworks filters network list according to user specified filter
// and returns user chosen networks
func FilterNetworks(nws []network.Inspect, filter filters.Args) ([]network.Inspect, error) {
	// if filter is empty, return original network list
	if filter.Len() == 0 {
		return nws, nil
	}
	// Filter in place: displayNet shares nws' backing array.
	displayNet := nws[:0]
	for _, nw := range nws {
		if filter.Contains("driver") {
			if !filter.ExactMatch("driver", nw.Driver) {
				continue
			}
		}
		if filter.Contains("name") {
			if !filter.Match("name", nw.Name) {
				continue
			}
		}
		if filter.Contains("id") {
			if !filter.Match("id", nw.ID) {
				continue
			}
		}
		if filter.Contains("label") {
			if !filter.MatchKVList("label", nw.Labels) {
				continue
			}
		}
		if filter.Contains("scope") {
			if !filter.ExactMatch("scope", nw.Scope) {
				continue
			}
		}
		if filter.Contains("idOrName") {
			// Keep the network when the value matches either the name or
			// the ID. Fix: the id comparison previously used nw.Name, so a
			// real network ID could never match.
			if !filter.Match("name", nw.Name) && !filter.Match("id", nw.ID) {
				continue
			}
		}
		displayNet = append(displayNet, nw)
	}
	if values := filter.Get("dangling"); len(values) > 0 {
		if len(values) > 1 {
			return nil, errdefs.InvalidParameter(errors.New(`got more than one value for filter key "dangling"`))
		}
		var danglingOnly bool
		switch values[0] {
		case "0", "false":
			// dangling is false already
		case "1", "true":
			danglingOnly = true
		default:
			return nil, errdefs.InvalidParameter(errors.New(`invalid value for filter 'dangling', must be "true" (or "1"), or "false" (or "0")`))
		}
		displayNet = filterNetworkByUse(displayNet, danglingOnly)
	}
	if filter.Contains("type") {
		typeNet := []network.Inspect{}
		errFilter := filter.WalkValues("type", func(fval string) error {
			passList, err := filterNetworkByType(displayNet, fval)
			if err != nil {
				return err
			}
			typeNet = append(typeNet, passList...)
			return nil
		})
		if errFilter != nil {
			return nil, errFilter
		}
		displayNet = typeNet
	}
	return displayNet, nil
}
// filterNetworkByUse keeps either dangling networks (unused, non-predefined)
// or in-use/predefined networks, depending on danglingOnly.
func filterNetworkByUse(nws []network.Inspect, danglingOnly bool) []network.Inspect {
	inUse := func(nw network.Inspect) bool {
		return IsPredefined(nw.Name) || len(nw.Containers) > 0 || len(nw.Services) > 0
	}
	kept := []network.Inspect{}
	for _, nw := range nws {
		// danglingOnly selects !inUse; otherwise select inUse.
		if inUse(nw) != danglingOnly {
			kept = append(kept, nw)
		}
	}
	return kept
}
// filterNetworkByType keeps only "builtin" (predefined) or "custom"
// (user-created) networks; any other type value is an error.
func filterNetworkByType(nws []network.Inspect, netType string) ([]network.Inspect, error) {
	var wantPredefined bool
	switch netType {
	case "builtin":
		wantPredefined = true
	case "custom":
		wantPredefined = false
	default:
		return nil, errors.Errorf("invalid filter: 'type'='%s'", netType)
	}
	kept := []network.Inspect{}
	for _, nw := range nws {
		if IsPredefined(nw.Name) == wantPredefined {
			kept = append(kept, nw)
		}
	}
	return kept, nil
}
package network
// DefaultNetwork is the name of the default network driver to use for containers
// on the daemon platform. The default for Linux containers is "bridge"
// ([network.NetworkBridge]), and "nat" ([network.NetworkNat]) for Windows
// containers.
const DefaultNetwork = defaultNetwork

// IsPredefined indicates if a network is predefined by the daemon.
func IsPredefined(network string) bool {
	// TODO(thaJeztah): check if we can align the check for both platforms
	return isPreDefined(network)
}
//go:build !windows
package network
import (
"github.com/moby/moby/api/types/container"
"github.com/moby/moby/api/types/network"
)
// defaultNetwork is the platform default driver ("bridge" on unix).
const defaultNetwork = network.NetworkBridge

// isPreDefined reports whether the named network is one of the built-in
// unix network modes (bridge, host, none, or the default).
func isPreDefined(network string) bool {
	n := container.NetworkMode(network)
	return n.IsBridge() || n.IsHost() || n.IsNone() || n.IsDefault()
}
package network
import (
"net"
"sync"
clustertypes "github.com/docker/docker/daemon/cluster/provider"
"github.com/docker/go-connections/nat"
networktypes "github.com/moby/moby/api/types/network"
"github.com/pkg/errors"
)
// Settings stores configuration details about the daemon network config
// TODO Windows. Many of these fields can be factored out.,
type Settings struct {
	Bridge                 string
	SandboxID              string
	SandboxKey             string
	HairpinMode            bool
	LinkLocalIPv6Address   string
	LinkLocalIPv6PrefixLen int
	Networks               map[string]*EndpointSettings
	Service                *clustertypes.ServiceConfig
	Ports                  nat.PortMap
	SecondaryIPAddresses   []networktypes.Address
	SecondaryIPv6Addresses []networktypes.Address
	HasSwarmEndpoint       bool
}

// EndpointSettings is a package local wrapper for
// networktypes.EndpointSettings which stores Endpoint state that
// needs to be persisted to disk but not exposed in the api.
type EndpointSettings struct {
	*networktypes.EndpointSettings
	IPAMOperational bool
	// DesiredMacAddress is the configured value, it's copied from MacAddress (the
	// API param field) when the container is created.
	DesiredMacAddress string
}

// AttachmentStore stores the load balancer IP address for a network id.
type AttachmentStore struct {
	sync.Mutex // guards networkToNodeLBIP
	// key: networkd id
	// value: load balancer ip address
	networkToNodeLBIP map[string]net.IP
}
// ResetAttachments clears any existing load balancer IP to network mapping and
// sets the mapping to the given attachments.
func (store *AttachmentStore) ResetAttachments(attachments map[string]string) error {
	store.Lock()
	defer store.Unlock()
	store.clearAttachments()
	for nid, nodeIP := range attachments {
		// nodeIP is expected in CIDR notation; only the address part is kept.
		ip, _, err := net.ParseCIDR(nodeIP)
		if err != nil {
			// Leave the store empty rather than partially populated.
			store.networkToNodeLBIP = make(map[string]net.IP)
			return errors.Wrapf(err, "Failed to parse load balancer address %s", nodeIP)
		}
		store.networkToNodeLBIP[nid] = ip
	}
	return nil
}
// ClearAttachments clears all the mappings of network to load balancer IP Address.
func (store *AttachmentStore) ClearAttachments() {
	store.Lock()
	defer store.Unlock()
	store.clearAttachments()
}

// clearAttachments replaces the map with a fresh one; callers must hold the lock.
func (store *AttachmentStore) clearAttachments() {
	store.networkToNodeLBIP = make(map[string]net.IP)
}

// GetIPForNetwork return the load balancer IP address for the given network.
func (store *AttachmentStore) GetIPForNetwork(networkID string) (net.IP, bool) {
	store.Lock()
	defer store.Unlock()
	ip, exists := store.networkToNodeLBIP[networkID]
	return ip, exists
}
package opts
import (
"encoding/csv"
"encoding/json"
"fmt"
"net/netip"
"strconv"
"strings"
"github.com/docker/docker/daemon/libnetwork/ipamutils"
)
// PoolsOpt is a Value type for parsing the default address pools definitions
type PoolsOpt struct {
	Values []*ipamutils.NetworkToSplit
}

// UnmarshalJSON fills values structure info from JSON input
func (p *PoolsOpt) UnmarshalJSON(raw []byte) error {
	return json.Unmarshal(raw, &(p.Values))
}
// Set predefined pools
// The value is a single CSV record of key=value fields, e.g.
// "base=10.10.0.0/16,size=24"; each call appends one pool definition.
func (p *PoolsOpt) Set(value string) error {
	csvReader := csv.NewReader(strings.NewReader(value))
	fields, err := csvReader.Read()
	if err != nil {
		return err
	}

	poolsDef := ipamutils.NetworkToSplit{}

	for _, field := range fields {
		// TODO(thaJeztah): this should not be case-insensitive.
		// Note: ToLower is applied to the whole field, so the value part is
		// lowercased as well, not just the key.
		key, val, ok := strings.Cut(strings.ToLower(field), "=")
		if !ok {
			return fmt.Errorf("invalid field '%s' must be a key=value pair", field)
		}

		switch key {
		case "base":
			base, err := netip.ParsePrefix(val)
			if err != nil {
				return fmt.Errorf("invalid base prefix %q: %w", val, err)
			}
			poolsDef.Base = base
		case "size":
			size, err := strconv.Atoi(val)
			if err != nil {
				return fmt.Errorf("invalid size value: %q (must be integer): %v", value, err)
			}
			poolsDef.Size = size
		default:
			return fmt.Errorf("unexpected key '%s' in '%s'", key, field)
		}
	}

	p.Values = append(p.Values, &poolsDef)

	return nil
}
// Type returns the type of this option
func (p *PoolsOpt) Type() string {
	return "pool-options"
}

// String returns a string repr of this option
func (p *PoolsOpt) String() string {
	var pools []string
	for _, pool := range p.Values {
		repr := fmt.Sprintf("%s %d", pool.Base, pool.Size)
		pools = append(pools, repr)
	}
	return strings.Join(pools, ", ")
}

// Value returns the mounts
func (p *PoolsOpt) Value() []*ipamutils.NetworkToSplit {
	return p.Values
}

// Name returns the flag name of this option
func (p *PoolsOpt) Name() string {
	return "default-address-pools"
}
package opts
import (
"os"
"strings"
"github.com/pkg/errors"
)
// ValidateEnv validates an environment variable and returns it.
// If no value is specified, it obtains its value from the current environment
//
// As on ParseEnvFile and related to #16585, environment variable names
// are not validate whatsoever, it's up to application inside docker
// to validate them or not.
//
// The only validation here is to check if name is empty, per #25099
func ValidateEnv(val string) (string, error) {
	name, _, hasValue := strings.Cut(val, "=")
	switch {
	case name == "":
		return "", errors.New("invalid environment variable: " + val)
	case hasValue:
		// Already in name=value form; pass through unchanged.
		return val, nil
	}
	// Name only: resolve the value from the current process environment.
	if envVal, found := os.LookupEnv(name); found {
		return name + "=" + envVal, nil
	}
	return val, nil
}
package opts
import (
"net"
"net/url"
"path/filepath"
"strconv"
"strings"
"github.com/docker/docker/pkg/homedir"
"github.com/pkg/errors"
)
const (
	// DefaultHTTPPort Default HTTP Port used if only the protocol is provided to -H flag e.g. dockerd -H tcp://
	// These are the IANA registered port numbers for use with Docker
	// see http://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml?search=docker
	DefaultHTTPPort = 2375 // Default HTTP Port
	// DefaultTLSHTTPPort Default HTTP Port used when TLS enabled
	DefaultTLSHTTPPort = 2376 // Default TLS encrypted HTTP Port
	// DefaultUnixSocket Path for the unix socket.
	// Docker daemon by default always listens on the default unix socket
	DefaultUnixSocket = "/var/run/docker.sock"
	// DefaultTCPHost constant defines the default host string used by docker on Windows
	// (DefaultHTTPHost is defined in a platform-specific file of this package.)
	DefaultTCPHost = "tcp://" + DefaultHTTPHost + ":2375"
	// DefaultTLSHost constant defines the default host string used by docker for TLS sockets
	DefaultTLSHost = "tcp://" + DefaultHTTPHost + ":2376"
	// DefaultNamedPipe defines the default named pipe used by docker on Windows
	DefaultNamedPipe = `//./pipe/docker_engine`
	// HostGatewayName is the string value that can be passed
	// to the IPAddr section in --add-host that is replaced by
	// the value of HostGatewayIP daemon config value
	HostGatewayName = "host-gateway"
)
// ValidateHost validates that the specified string is a valid host and returns it.
func ValidateHost(val string) (string, error) {
	// The empty string means default and is not handled by parseDaemonHost.
	if host := strings.TrimSpace(val); host != "" {
		if _, err := parseDaemonHost(host); err != nil {
			return val, err
		}
	}
	// Note: unlike most flag validators, we don't return the mutated value here
	// we need to know what the user entered later (using ParseHost) to adjust for TLS
	return val, nil
}
// ParseHost and set defaults for a Daemon host string.
// defaultToTLS is preferred over defaultToUnixXDG.
func ParseHost(defaultToTLS, defaultToUnixXDG bool, val string) (string, error) {
host := strings.TrimSpace(val)
if host == "" {
if defaultToTLS {
host = DefaultTLSHost
} else if defaultToUnixXDG {
runtimeDir, err := homedir.GetRuntimeDir()
if err != nil {
return "", err
}
host = "unix://" + filepath.Join(runtimeDir, "docker.sock")
} else {
host = DefaultHost
}
} else {
var err error
host, err = parseDaemonHost(host)
if err != nil {
return val, err
}
}
return host, nil
}
// parseDaemonHost parses the specified address and returns an address that will be used as the host.
// Depending on the address specified, this may return one of the global Default* strings defined in hosts.go.
func parseDaemonHost(address string) (string, error) {
	proto, addr, ok := strings.Cut(address, "://")
	if !ok && proto != "" {
		// No scheme present (e.g. "127.0.0.1:2375"): treat the whole input
		// as a tcp address.
		addr = proto
		proto = "tcp"
	}

	switch proto {
	case "tcp":
		return ParseTCPAddr(address, DefaultTCPHost)
	case "unix":
		a, err := parseSimpleProtoAddr(proto, addr, DefaultUnixSocket)
		if err != nil {
			return "", errors.Wrapf(err, "invalid bind address (%s)", address)
		}
		return a, nil
	case "npipe":
		a, err := parseSimpleProtoAddr(proto, addr, DefaultNamedPipe)
		if err != nil {
			return "", errors.Wrapf(err, "invalid bind address (%s)", address)
		}
		return a, nil
	case "fd":
		// Socket-activation addresses are passed through untouched.
		return address, nil
	default:
		return "", errors.Errorf("invalid bind address (%s): unsupported proto '%s'", address, proto)
	}
}
// parseSimpleProtoAddr parses and validates that the specified address is a valid
// socket address for simple protocols like unix and npipe. It returns a formatted
// socket address, either using the address parsed from addr, or the contents of
// defaultAddr if addr is a blank string.
func parseSimpleProtoAddr(proto, addr, defaultAddr string) (string, error) {
	switch {
	case strings.Contains(addr, "://"):
		// The scheme was already stripped by the caller; a second one is invalid.
		return "", errors.Errorf("invalid %s address: %s", proto, addr)
	case addr == "":
		addr = defaultAddr
	}
	return proto + "://" + addr, nil
}
// ParseTCPAddr parses and validates that the specified address is a valid TCP
// address. It returns a formatted TCP address, either using the address parsed
// from tryAddr, or the contents of defaultAddr if tryAddr is a blank string.
// tryAddr is expected to have already been Trim()'d
// defaultAddr must be in the full `tcp://host:port` form
func ParseTCPAddr(tryAddr string, defaultAddr string) (string, error) {
	defaults, err := parseTCPAddr(defaultAddr, true)
	if err != nil {
		return "", errors.Wrapf(err, "invalid default address (%s)", defaultAddr)
	}
	parsed, err := parseTCPAddr(tryAddr, false)
	if err != nil {
		return "", errors.Wrapf(err, "invalid bind address (%s)", tryAddr)
	}
	// Fill any missing component from the defaults.
	host, port := parsed.Hostname(), parsed.Port()
	if host == "" {
		host = defaults.Hostname()
	}
	if port == "" {
		port = defaults.Port()
	}
	return "tcp://" + net.JoinHostPort(host, port), nil
}
// parseTCPAddr parses the given addr and validates if it is in the expected
// format. If strict is enabled, the address must contain a scheme (tcp://),
// a host (or IP-address) and a port number.
func parseTCPAddr(address string, strict bool) (*url.URL, error) {
	if !strict && !strings.Contains(address, "://") {
		// Lenient mode: assume tcp when no scheme is given.
		address = "tcp://" + address
	}
	parsedURL, err := url.Parse(address)
	if err != nil {
		return nil, err
	}
	if parsedURL.Scheme != "tcp" {
		return nil, errors.Errorf("unsupported proto '%s'", parsedURL.Scheme)
	}
	if parsedURL.Path != "" {
		return nil, errors.New("should not contain a path element")
	}
	if strict && parsedURL.Host == "" {
		return nil, errors.New("no host or IP address")
	}
	// In strict mode a port is mandatory; in lenient mode it is only
	// validated when present. Port 0 is rejected in both cases.
	if parsedURL.Port() != "" || strict {
		if p, err := strconv.Atoi(parsedURL.Port()); err != nil || p == 0 {
			return nil, errors.Errorf("invalid port: %s", parsedURL.Port())
		}
	}
	return parsedURL, nil
}
// ValidateExtraHost validates that the specified string is a valid extrahost and returns it.
// ExtraHost is in the form of name:ip where the ip has to be a valid ip (IPv4 or IPv6).
func ValidateExtraHost(val string) (string, error) {
	// allow for IPv6 addresses in extra hosts by only splitting on first ":"
	name, ip, ok := strings.Cut(val, ":")
	if !ok || name == "" {
		return "", errors.Errorf("bad format for add-host: %q", val)
	}
	// Skip IPaddr validation for special "host-gateway" string
	if ip != HostGatewayName {
		if _, err := ValidateIPAddress(ip); err != nil {
			return "", errors.Errorf("invalid IP address in add-host: %q", ip)
		}
	}
	// Return the original (unnormalized) input on success.
	return val, nil
}
package opts
import (
"errors"
"fmt"
"net"
"path"
"strings"
"github.com/docker/docker/internal/lazyregexp"
"github.com/docker/go-units"
)
var (
	// alphaRegexp matches any ASCII letter; used to reject all-numeric domains.
	alphaRegexp = lazyregexp.New(`[a-zA-Z]`)
	// domainRegexp validates DNS domain names: dot-separated labels of
	// alphanumerics and inner hyphens, optional trailing dot and whitespace.
	// NOTE(review): the `(:?` groups look like they were meant to be
	// non-capturing `(?:` — they also match a literal leading colon; confirm
	// before relying on group numbering.
	domainRegexp = lazyregexp.New(`^(:?(:?[a-zA-Z0-9]|(:?[a-zA-Z0-9][a-zA-Z0-9\-]*[a-zA-Z0-9]))(:?\.(:?[a-zA-Z0-9]|(:?[a-zA-Z0-9][a-zA-Z0-9\-]*[a-zA-Z0-9])))*)\.?\s*$`)
)

// ListOpts holds a list of values and a validation function.
type ListOpts struct {
	// values points at the caller-owned backing slice so it can be shared.
	values    *[]string
	validator ValidatorFctType
}
// NewListOpts creates a new ListOpts with the specified validator.
func NewListOpts(validator ValidatorFctType) ListOpts {
	var values []string
	return *NewListOptsRef(&values, validator)
}

// NewListOptsRef creates a new ListOpts with the specified values and validator.
func NewListOptsRef(values *[]string, validator ValidatorFctType) *ListOpts {
	return &ListOpts{
		values:    values,
		validator: validator,
	}
}
// String returns the values as a printable list, or "" when empty.
func (opts *ListOpts) String() string {
	if len(*opts.values) == 0 {
		return ""
	}
	return fmt.Sprintf("%v", *opts.values)
}

// Set validates if needed the input value and adds it to the
// internal slice.
func (opts *ListOpts) Set(value string) error {
	if opts.validator != nil {
		// The validator may normalize the value; store its result.
		v, err := opts.validator(value)
		if err != nil {
			return err
		}
		value = v
	}
	*opts.values = append(*opts.values, value)
	return nil
}
// Delete removes the specified element from the slice.
// Only the first occurrence is removed.
func (opts *ListOpts) Delete(key string) {
	for i, k := range *opts.values {
		if k == key {
			*opts.values = append((*opts.values)[:i], (*opts.values)[i+1:]...)
			return
		}
	}
}

// GetMap returns the content of values in a map in order to avoid
// duplicates.
func (opts *ListOpts) GetMap() map[string]struct{} {
	ret := make(map[string]struct{})
	for _, k := range *opts.values {
		ret[k] = struct{}{}
	}
	return ret
}
// GetAll returns the values of slice.
func (opts *ListOpts) GetAll() []string {
	return *opts.values
}

// GetAllOrEmpty returns the values of the slice
// or an empty slice when there are no values.
func (opts *ListOpts) GetAllOrEmpty() []string {
	v := *opts.values
	if v == nil {
		return make([]string, 0)
	}
	return v
}

// Get checks the existence of the specified key.
func (opts *ListOpts) Get(key string) bool {
	for _, k := range *opts.values {
		if k == key {
			return true
		}
	}
	return false
}

// Len returns the amount of element in the slice.
func (opts *ListOpts) Len() int {
	return len(*opts.values)
}

// Type returns a string name for this Option type
func (opts *ListOpts) Type() string {
	return "list"
}

// WithValidator returns the ListOpts with validator set.
func (opts *ListOpts) WithValidator(validator ValidatorFctType) *ListOpts {
	opts.validator = validator
	return opts
}
// NamedOption is an interface that list and map options
// with names implement.
type NamedOption interface {
	Name() string
}

// NamedListOpts is a ListOpts with a configuration name.
// This struct is useful to keep reference to the assigned
// field name in the internal configuration struct.
type NamedListOpts struct {
	name string
	ListOpts
}

var _ NamedOption = &NamedListOpts{}

// NewNamedListOptsRef creates a reference to a new NamedListOpts struct.
func NewNamedListOptsRef(name string, values *[]string, validator ValidatorFctType) *NamedListOpts {
	return &NamedListOpts{
		name:     name,
		ListOpts: *NewListOptsRef(values, validator),
	}
}

// Name returns the name of the NamedListOpts in the configuration.
func (o *NamedListOpts) Name() string {
	return o.name
}

// NamedMapMapOpts is a MapMapOpts with a configuration name.
// This struct is useful to keep reference to the assigned
// field name in the internal configuration struct.
type NamedMapMapOpts struct {
	name string
	MapMapOpts
}

// NewNamedMapMapOpts creates a reference to a new NamedMapMapOpts struct.
func NewNamedMapMapOpts(name string, values map[string]map[string]string, validator ValidatorFctType) *NamedMapMapOpts {
	return &NamedMapMapOpts{
		name:       name,
		MapMapOpts: *NewMapMapOpts(values, validator),
	}
}

// Name returns the name of the NamedMapMapOpts in the configuration.
func (o *NamedMapMapOpts) Name() string {
	return o.name
}

// MapMapOpts holds a map of maps of values and a validation function.
type MapMapOpts struct {
	values    map[string]map[string]string
	validator ValidatorFctType
}
// Set validates (when a validator is configured) the input value and stores
// it in the nested map. The expected form is root-key=key=value.
func (opts *MapMapOpts) Set(value string) error {
	if opts.validator != nil {
		validated, err := opts.validator(value)
		if err != nil {
			return err
		}
		value = validated
	}
	rootKey, rest, ok := strings.Cut(value, "=")
	if !ok {
		return fmt.Errorf("invalid value %q for map option, should be root-key=key=value", value)
	}
	subKey, subValue, ok := strings.Cut(rest, "=")
	if !ok {
		return fmt.Errorf("invalid value %q for map option, should be root-key=key=value", value)
	}
	inner, exists := opts.values[rootKey]
	if !exists {
		inner = make(map[string]string)
		opts.values[rootKey] = inner
	}
	inner[subKey] = subValue
	return nil
}
// GetAll returns the values of MapOpts as a map.
func (opts *MapMapOpts) GetAll() map[string]map[string]string {
	return opts.values
}

func (opts *MapMapOpts) String() string {
	return fmt.Sprintf("%v", opts.values)
}

// Type returns a string name for this Option type
func (opts *MapMapOpts) Type() string {
	return "mapmap"
}

// NewMapMapOpts creates a new MapMapOpts with the specified map of values and a validator.
// A nil values map is replaced with an empty one so Set never writes to a nil map.
func NewMapMapOpts(values map[string]map[string]string, validator ValidatorFctType) *MapMapOpts {
	if values == nil {
		values = make(map[string]map[string]string)
	}
	return &MapMapOpts{
		values:    values,
		validator: validator,
	}
}
// MapOpts holds a map of values and a validation function.
type MapOpts struct {
	values    map[string]string
	validator ValidatorFctType
}

// Set validates if needed the input value and add it to the
// internal map, by splitting on '='.
// A value without '=' is stored under the whole string with an empty value.
func (opts *MapOpts) Set(value string) error {
	if opts.validator != nil {
		v, err := opts.validator(value)
		if err != nil {
			return err
		}
		value = v
	}
	k, v, _ := strings.Cut(value, "=")
	(opts.values)[k] = v
	return nil
}

// GetAll returns the values of MapOpts as a map.
func (opts *MapOpts) GetAll() map[string]string {
	return opts.values
}

func (opts *MapOpts) String() string {
	return fmt.Sprintf("%v", opts.values)
}

// Type returns a string name for this Option type
func (opts *MapOpts) Type() string {
	return "map"
}
// NewMapOpts creates a new MapOpts with the specified map of values and a validator.
// A nil values map is replaced with an empty one so Set never writes to a nil map.
func NewMapOpts(values map[string]string, validator ValidatorFctType) *MapOpts {
	if values == nil {
		values = map[string]string{}
	}
	opts := &MapOpts{validator: validator}
	opts.values = values
	return opts
}
// NamedMapOpts is a MapOpts struct with a configuration name.
// This struct is useful to keep reference to the assigned
// field name in the internal configuration struct.
type NamedMapOpts struct {
	name string
	MapOpts
}

var _ NamedOption = &NamedMapOpts{}

// NewNamedMapOpts creates a reference to a new NamedMapOpts struct.
func NewNamedMapOpts(name string, values map[string]string, validator ValidatorFctType) *NamedMapOpts {
	inner := NewMapOpts(values, validator)
	return &NamedMapOpts{name: name, MapOpts: *inner}
}

// Name returns the name of the NamedMapOpts in the configuration.
func (o *NamedMapOpts) Name() string {
	return o.name
}
// ValidatorFctType defines a validator function that returns a validated string and/or an error.
type ValidatorFctType func(val string) (string, error)

// ValidatorFctListType defines a validator function that returns a validated list of string and/or an error.
type ValidatorFctListType func(val string) ([]string, error)
// ValidateIPAddress validates if the given value is a correctly formatted
// IP address, and returns the value in normalized form. Leading and trailing
// whitespace is allowed, but it does not allow IPv6 addresses surrounded by
// square brackets ("[::1]").
//
// Refer to [net.ParseIP] for accepted formats.
func ValidateIPAddress(val string) (string, error) {
	trimmed := strings.TrimSpace(val)
	if ip := net.ParseIP(trimmed); ip != nil {
		return ip.String(), nil
	}
	return "", fmt.Errorf("IP address is not correctly formatted: %s", val)
}
// ValidateDNSSearch validates domain for resolvconf search configuration.
// A zero length domain is represented by a dot (.).
func ValidateDNSSearch(val string) (string, error) {
	trimmed := strings.Trim(val, " ")
	if trimmed == "." {
		return trimmed, nil
	}
	return validateDomain(trimmed)
}
// validateDomain checks that val is a syntactically valid domain name (per
// the package-level alphaRegexp/domainRegexp) and returns the matched domain.
func validateDomain(val string) (string, error) {
	if alphaRegexp.FindString(val) != "" {
		// The first submatch must also stay under the 255-byte DNS limit.
		if ns := domainRegexp.FindSubmatch([]byte(val)); len(ns) > 0 && len(ns[1]) < 255 {
			return string(ns[1]), nil
		}
	}
	return "", fmt.Errorf("%s is not a valid domain", val)
}
// ValidateLabel validates that the specified string is a valid label,
// it does not use the reserved namespaces com.docker.*, io.docker.*, org.dockerproject.*
// and returns it.
// Labels are in the form on key=value.
func ValidateLabel(val string) (string, error) {
	if !strings.Contains(val, "=") {
		return "", fmt.Errorf("bad attribute format: %s", val)
	}
	lowered := strings.ToLower(val)
	for _, reserved := range []string{"com.docker.", "io.docker.", "org.dockerproject."} {
		if strings.HasPrefix(lowered, reserved) {
			return "", fmt.Errorf(
				"label %s is not allowed: the namespaces com.docker.*, io.docker.*, and org.dockerproject.* are reserved for internal use",
				val)
		}
	}
	return val, nil
}
// ValidateSingleGenericResource validates that a single entry in the
// generic resource list is valid.
// i.e 'GPU=UID1' is valid however 'GPU:UID1' or 'UID1' isn't
func ValidateSingleGenericResource(val string) (string, error) {
	if !strings.Contains(val, "=") {
		return "", fmt.Errorf("invalid node-generic-resource format `%s` expected `name=value`", val)
	}
	return val, nil
}
// ParseLink parses and validates the specified string as a link format (name:alias)
func ParseLink(val string) (string, string, error) {
	if val == "" {
		return "", "", errors.New("empty string specified for links")
	}
	name, alias, found := strings.Cut(val, ":")
	if !found {
		// A bare name links to itself.
		return val, val, nil
	}
	if strings.Contains(alias, ":") {
		// More than one separator is not a valid link spec.
		return "", "", fmt.Errorf("bad format for links: %s", val)
	}
	// This is kept because we can actually get a HostConfig with links
	// from an already created container and the format is not `foo:bar`
	// but `/foo:/c1/bar`
	if strings.HasPrefix(name, "/") {
		_, shortAlias := path.Split(alias)
		return name[1:], shortAlias, nil
	}
	return name, alias, nil
}
// MemBytes is a type for human readable memory bytes (like 128M, 2g, etc)
type MemBytes int64

// String returns the string format of the human readable memory bytes
func (m *MemBytes) String() string {
	// NOTE: In spf13/pflag/flag.go, "0" is considered as "zero value" while "0 B" is not.
	// We return "0" in case value is 0 here so that the default value is hidden.
	// (Sometimes "default 0 B" is actually misleading)
	v := m.Value()
	if v == 0 {
		return "0"
	}
	return units.BytesSize(float64(v))
}

// Set sets the value of the MemBytes by passing a string
func (m *MemBytes) Set(value string) error {
	parsed, err := units.RAMInBytes(value)
	*m = MemBytes(parsed)
	return err
}

// Type returns the type
func (m *MemBytes) Type() string {
	return "bytes"
}

// Value returns the value in int64
func (m *MemBytes) Value() int64 {
	return int64(*m)
}

// UnmarshalJSON is the customized unmarshaler for MemBytes; it only accepts
// a quoted JSON string (e.g. "128M") and parses it with units.RAMInBytes.
func (m *MemBytes) UnmarshalJSON(s []byte) error {
	if len(s) <= 2 || s[0] != '"' || s[len(s)-1] != '"' {
		return fmt.Errorf("invalid size: %q", s)
	}
	parsed, err := units.RAMInBytes(string(s[1 : len(s)-1]))
	*m = MemBytes(parsed)
	return err
}
package opts
import (
"fmt"
"strings"
"github.com/moby/moby/api/types/system"
)
// RuntimeOpt defines a map of Runtimes
type RuntimeOpt struct {
	name             string
	stockRuntimeName string
	values           *map[string]system.Runtime
}

// NewNamedRuntimeOpt creates a new RuntimeOpt
func NewNamedRuntimeOpt(name string, ref *map[string]system.Runtime, stockRuntime string) *RuntimeOpt {
	if ref == nil {
		ref = &map[string]system.Runtime{}
	}
	return &RuntimeOpt{
		name:             name,
		values:           ref,
		stockRuntimeName: stockRuntime,
	}
}

// Name returns the name of the NamedListOpts in the configuration.
func (o *RuntimeOpt) Name() string {
	return o.name
}

// Set validates a "name=path" runtime argument and adds it to the map.
// The stock runtime name is reserved and duplicates are rejected.
func (o *RuntimeOpt) Set(val string) error {
	name, pth, ok := strings.Cut(val, "=")
	if !ok {
		return fmt.Errorf("invalid runtime argument: %s", val)
	}
	// TODO(thaJeztah): this should not accept spaces.
	name, pth = strings.TrimSpace(name), strings.TrimSpace(pth)
	if name == "" || pth == "" {
		return fmt.Errorf("invalid runtime argument: %s", val)
	}
	// TODO(thaJeztah): this should not be case-insensitive.
	name = strings.ToLower(name)
	if name == o.stockRuntimeName {
		return fmt.Errorf("runtime name '%s' is reserved", o.stockRuntimeName)
	}
	if _, exists := (*o.values)[name]; exists {
		return fmt.Errorf("runtime '%s' was already defined", name)
	}
	(*o.values)[name] = system.Runtime{Path: pth}
	return nil
}

// String returns Runtime values as a string.
func (o *RuntimeOpt) String() string {
	names := make([]string, 0, len(*o.values))
	for name := range *o.values {
		names = append(names, name)
	}
	return fmt.Sprintf("%v", names)
}

// GetMap returns a map of Runtimes (name: path)
func (o *RuntimeOpt) GetMap() map[string]system.Runtime {
	if o.values == nil {
		return map[string]system.Runtime{}
	}
	return *o.values
}

// Type returns the type of the option
func (o *RuntimeOpt) Type() string {
	return "runtime"
}
package opts
import (
"fmt"
"github.com/docker/go-units"
"github.com/moby/moby/api/types/container"
)
// UlimitOpt defines a map of Ulimits
type UlimitOpt struct {
	values *map[string]*container.Ulimit
}

// NewUlimitOpt creates a new UlimitOpt
func NewUlimitOpt(ref *map[string]*container.Ulimit) *UlimitOpt {
	// TODO(thaJeztah): why do we need a map with pointers here?
	if ref == nil {
		ref = &map[string]*container.Ulimit{}
	}
	return &UlimitOpt{values: ref}
}

// Set parses the "name=soft[:hard]" value and stores the resulting ulimit
// keyed by its name, so a later value for the same name replaces the earlier one.
func (o *UlimitOpt) Set(val string) error {
	// FIXME(thaJeztah): these functions also need to be moved over from go-units.
	parsed, err := units.ParseUlimit(val)
	if err != nil {
		return err
	}
	(*o.values)[parsed.Name] = parsed
	return nil
}

// String returns Ulimit values as a string.
func (o *UlimitOpt) String() string {
	out := make([]string, 0, len(*o.values))
	for _, v := range *o.values {
		out = append(out, v.String())
	}
	return fmt.Sprintf("%v", out)
}

// GetList returns a slice of pointers to Ulimits.
func (o *UlimitOpt) GetList() []*container.Ulimit {
	var ulimits []*container.Ulimit
	for _, v := range *o.values {
		ulimits = append(ulimits, v)
	}
	return ulimits
}

// Type returns the option type
func (o *UlimitOpt) Type() string {
	return "ulimit"
}
// NamedUlimitOpt defines a named map of Ulimits
type NamedUlimitOpt struct {
	name string
	UlimitOpt
}

var _ NamedOption = &NamedUlimitOpt{}

// NewNamedUlimitOpt creates a new NamedUlimitOpt
func NewNamedUlimitOpt(name string, ref *map[string]*container.Ulimit) *NamedUlimitOpt {
	if ref == nil {
		ref = &map[string]*container.Ulimit{}
	}
	return &NamedUlimitOpt{
		name:      name,
		UlimitOpt: *NewUlimitOpt(ref),
	}
}

// Name returns the option name
func (o *NamedUlimitOpt) Name() string {
	return o.name
}
package plugin
import (
"archive/tar"
"bytes"
"compress/gzip"
"context"
"encoding/json"
"io"
"net/http"
"os"
"path"
"path/filepath"
"strings"
"time"
"github.com/containerd/containerd/v2/core/content"
c8dimages "github.com/containerd/containerd/v2/core/images"
"github.com/containerd/containerd/v2/core/remotes"
"github.com/containerd/containerd/v2/core/remotes/docker"
"github.com/containerd/log"
"github.com/containerd/platforms"
"github.com/distribution/reference"
v2 "github.com/docker/docker/daemon/pkg/plugin/v2"
"github.com/docker/docker/dockerversion"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/internal/containerfs"
"github.com/docker/docker/pkg/authorization"
"github.com/docker/docker/pkg/pools"
"github.com/docker/docker/pkg/progress"
"github.com/docker/docker/pkg/stringid"
"github.com/moby/go-archive/chrootarchive"
"github.com/moby/moby/api/types"
"github.com/moby/moby/api/types/backend"
"github.com/moby/moby/api/types/events"
"github.com/moby/moby/api/types/filters"
"github.com/moby/moby/api/types/registry"
"github.com/moby/sys/mount"
"github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// MediaTypePluginConfig specifies the mediaType for plugin configuration.
//
// It is a copy of the mediaType defined in [schema2.MediaTypePluginConfig].
//
// [schema2.MediaTypePluginConfig]: https://pkg.go.dev/github.com/distribution/distribution/v3@v3.0.0/manifest/schema2#MediaTypePluginConfig
const MediaTypePluginConfig = "application/vnd.docker.plugin.v1+json"

// acceptedPluginFilterTags is the set of filter keys accepted by List.
var acceptedPluginFilterTags = map[string]bool{
	"enabled":    true,
	"capability": true,
}
// Disable deactivates a plugin. This means resources (volumes, networks) can't use them.
func (pm *Manager) Disable(refOrID string, config *backend.PluginDisableConfig) error {
	p, err := pm.config.Store.GetV2Plugin(refOrID)
	if err != nil {
		return err
	}
	pm.mu.RLock()
	c := pm.cMap[p]
	pm.mu.RUnlock()
	// Refuse to disable a plugin that is still referenced, unless the caller
	// explicitly forces it.
	if !config.ForceDisable && p.GetRefCount() > 0 {
		return errors.WithStack(inUseError(p.Name()))
	}
	// If the plugin implements the authz API, unhook it from the authz
	// middleware before disabling.
	for _, typ := range p.GetTypes() {
		if typ.Capability == authorization.AuthZApiImplements {
			pm.config.AuthzMiddleware.RemovePlugin(p.Name())
		}
	}
	if err := pm.disable(p, c); err != nil {
		return err
	}
	// Notify internal subscribers and record the API event.
	pm.publisher.Publish(EventDisable{Plugin: p.PluginObj})
	pm.config.LogPluginEvent(p.GetID(), refOrID, events.ActionDisable)
	return nil
}
// Enable activates a plugin, which implies that they are ready to be used by containers.
func (pm *Manager) Enable(refOrID string, config *backend.PluginEnableConfig) error {
	p, err := pm.config.Store.GetV2Plugin(refOrID)
	if err != nil {
		return err
	}
	// The controller carries the caller-supplied activation timeout (seconds).
	c := &controller{timeoutInSecs: config.Timeout}
	if err := pm.enable(p, c, false); err != nil {
		return err
	}
	// Notify internal subscribers and record the API event.
	pm.publisher.Publish(EventEnable{Plugin: p.PluginObj})
	pm.config.LogPluginEvent(p.GetID(), refOrID, events.ActionEnable)
	return nil
}
// Inspect examines a plugin config
func (pm *Manager) Inspect(refOrID string) (*types.Plugin, error) {
	p, err := pm.config.Store.GetV2Plugin(refOrID)
	if err != nil {
		return nil, err
	}
	// NOTE(review): this returns a pointer into the stored plugin object;
	// callers presumably must not mutate it — confirm against call sites.
	return &p.PluginObj, nil
}
// computePrivileges derives the list of privileges a user must grant to
// install a plugin with the given config: non-default network access, host
// ipc/pid namespaces, host mounts, host devices, "allow all devices", and
// additional Linux capabilities. The order of the returned entries follows
// the order of the checks below.
func computePrivileges(c types.PluginConfig) types.PluginPrivileges {
	var privileges types.PluginPrivileges
	// "null", "bridge", and "" network types require no grant.
	if c.Network.Type != "null" && c.Network.Type != "bridge" && c.Network.Type != "" {
		privileges = append(privileges, types.PluginPrivilege{
			Name:        "network",
			Description: "permissions to access a network",
			Value:       []string{c.Network.Type},
		})
	}
	if c.IpcHost {
		privileges = append(privileges, types.PluginPrivilege{
			Name:        "host ipc namespace",
			Description: "allow access to host ipc namespace",
			Value:       []string{"true"},
		})
	}
	if c.PidHost {
		privileges = append(privileges, types.PluginPrivilege{
			Name:        "host pid namespace",
			Description: "allow access to host pid namespace",
			Value:       []string{"true"},
		})
	}
	// Only mounts with a host source need an explicit grant.
	for _, mnt := range c.Mounts {
		if mnt.Source != nil {
			privileges = append(privileges, types.PluginPrivilege{
				Name:        "mount",
				Description: "host path to mount",
				Value:       []string{*mnt.Source},
			})
		}
	}
	for _, device := range c.Linux.Devices {
		if device.Path != nil {
			privileges = append(privileges, types.PluginPrivilege{
				Name:        "device",
				Description: "host device to access",
				Value:       []string{*device.Path},
			})
		}
	}
	if c.Linux.AllowAllDevices {
		privileges = append(privileges, types.PluginPrivilege{
			Name:        "allow-all-devices",
			Description: "allow 'rwm' access to all devices",
			Value:       []string{"true"},
		})
	}
	if len(c.Linux.Capabilities) > 0 {
		privileges = append(privileges, types.PluginPrivilege{
			Name:        "capabilities",
			Description: "list of additional capabilities required",
			Value:       c.Linux.Capabilities,
		})
	}
	return privileges
}
// Privileges pulls a plugin config and computes the privileges required to install it.
func (pm *Manager) Privileges(ctx context.Context, ref reference.Named, metaHeader http.Header, authConfig *registry.AuthConfig) (types.PluginPrivileges, error) {
	var (
		config     types.PluginConfig
		configSeen bool
	)
	// h walks the descriptors of the fetched reference: manifest descriptors
	// are unwrapped to their config descriptor; the plugin config blob itself
	// is decoded into config.
	h := func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
		switch desc.MediaType {
		case c8dimages.MediaTypeDockerSchema2Manifest, ocispec.MediaTypeImageManifest:
			data, err := content.ReadBlob(ctx, pm.blobStore, desc)
			if err != nil {
				return nil, errors.Wrapf(err, "error reading image manifest from blob store for %s", ref)
			}
			var m ocispec.Manifest
			if err := json.Unmarshal(data, &m); err != nil {
				return nil, errors.Wrapf(err, "error unmarshaling image manifest for %s", ref)
			}
			return []ocispec.Descriptor{m.Config}, nil
		case MediaTypePluginConfig:
			configSeen = true
			data, err := content.ReadBlob(ctx, pm.blobStore, desc)
			if err != nil {
				return nil, errors.Wrapf(err, "error reading plugin config from blob store for %s", ref)
			}
			if err := json.Unmarshal(data, &config); err != nil {
				return nil, errors.Wrapf(err, "error unmarshaling plugin config for %s", ref)
			}
		}
		return nil, nil
	}
	if err := pm.fetch(ctx, ref, authConfig, progress.DiscardOutput(), metaHeader, c8dimages.HandlerFunc(h)); err != nil {
		// Propagate the fetch error. Previously a nil error was returned
		// here, which made a failed fetch indistinguishable from a plugin
		// that requires no privileges.
		return types.PluginPrivileges{}, err
	}
	if !configSeen {
		return types.PluginPrivileges{}, errors.Errorf("did not find plugin config for specified reference %s", ref)
	}
	return computePrivileges(config), nil
}
// Upgrade upgrades a plugin
//
// TODO: replace reference package usage with simpler url.Parse semantics
func (pm *Manager) Upgrade(ctx context.Context, ref reference.Named, name string, metaHeader http.Header, authConfig *registry.AuthConfig, privileges types.PluginPrivileges, outStream io.Writer) error {
	p, err := pm.config.Store.GetV2Plugin(name)
	if err != nil {
		return err
	}
	if p.IsEnabled() {
		return errors.Wrap(enabledError(p.Name()), "plugin must be disabled before upgrading")
	}
	// revalidate because Pull is public
	if _, err := reference.ParseNormalizedNamed(name); err != nil {
		return errors.Wrapf(errdefs.InvalidParameter(err), "failed to parse %q", name)
	}
	// Hold the GC read-lock so freshly written blobs are not collected
	// while the upgrade is in flight.
	pm.muGC.RLock()
	defer pm.muGC.RUnlock()
	tmpRootFSDir, err := os.MkdirTemp(pm.tmpDir(), ".rootfs")
	if err != nil {
		return errors.Wrap(err, "error creating tmp dir for plugin rootfs")
	}
	// Clean up the temporary rootfs directory; previously it was never
	// removed and leaked on every upgrade (Pull already removes its own).
	defer os.RemoveAll(tmpRootFSDir)
	var md fetchMeta
	ctx, cancel := context.WithCancel(ctx)
	out, waitProgress := setupProgressOutput(outStream, cancel)
	defer waitProgress()
	// Fetch the plugin content, applying layers into tmpRootFSDir and
	// recording the config/manifest/blob digests into md.
	if err := pm.fetch(ctx, ref, authConfig, out, metaHeader, storeFetchMetadata(&md), childrenHandler(pm.blobStore), applyLayer(pm.blobStore, tmpRootFSDir, out)); err != nil {
		return err
	}
	pm.config.LogPluginEvent(reference.FamiliarString(ref), name, events.ActionPull)
	if err := validateFetchedMetadata(md); err != nil {
		return err
	}
	if err := pm.upgradePlugin(p, md.config, md.manifest, md.blobs, tmpRootFSDir, &privileges); err != nil {
		return err
	}
	p.PluginObj.PluginReference = ref.String()
	return nil
}
// Pull pulls a plugin, check if the correct privileges are provided and install the plugin.
//
// TODO: replace reference package usage with simpler url.Parse semantics
func (pm *Manager) Pull(ctx context.Context, ref reference.Named, name string, metaHeader http.Header, authConfig *registry.AuthConfig, privileges types.PluginPrivileges, outStream io.Writer, opts ...CreateOpt) error {
	// Hold the GC read-lock so freshly written blobs are not collected
	// while the pull is in flight.
	pm.muGC.RLock()
	defer pm.muGC.RUnlock()
	// revalidate because Pull is public
	nameref, err := reference.ParseNormalizedNamed(name)
	if err != nil {
		return errors.Wrapf(errdefs.InvalidParameter(err), "failed to parse %q", name)
	}
	name = reference.FamiliarString(reference.TagNameOnly(nameref))
	if err := pm.config.Store.validateName(name); err != nil {
		return errdefs.InvalidParameter(err)
	}
	tmpRootFSDir, err := os.MkdirTemp(pm.tmpDir(), ".rootfs")
	if err != nil {
		// NOTE(review): the message says "upgrade" but this is Pull — looks
		// copy-pasted from Upgrade; confirm before changing user-visible text.
		return errors.Wrap(errdefs.System(err), "error preparing upgrade")
	}
	defer os.RemoveAll(tmpRootFSDir)
	var md fetchMeta
	ctx, cancel := context.WithCancel(ctx)
	out, waitProgress := setupProgressOutput(outStream, cancel)
	defer waitProgress()
	// Fetch the plugin content, applying layers into tmpRootFSDir and
	// recording the config/manifest/blob digests into md.
	if err := pm.fetch(ctx, ref, authConfig, out, metaHeader, storeFetchMetadata(&md), childrenHandler(pm.blobStore), applyLayer(pm.blobStore, tmpRootFSDir, out)); err != nil {
		return err
	}
	pm.config.LogPluginEvent(reference.FamiliarString(ref), name, events.ActionPull)
	if err := validateFetchedMetadata(md); err != nil {
		return err
	}
	// Record the pulled reference on the created plugin, after any
	// caller-supplied CreateOpts.
	refOpt := func(p *v2.Plugin) {
		p.PluginObj.PluginReference = ref.String()
	}
	optsList := make([]CreateOpt, 0, len(opts)+1)
	optsList = append(optsList, opts...)
	optsList = append(optsList, refOpt)
	// TODO: tmpRootFSDir is empty but should have layers in it
	p, err := pm.createPlugin(name, md.config, md.manifest, md.blobs, tmpRootFSDir, &privileges, optsList...)
	if err != nil {
		return err
	}
	pm.publisher.Publish(EventCreate{Plugin: p.PluginObj})
	return nil
}
// List displays the list of plugins and associated metadata.
func (pm *Manager) List(pluginFilters filters.Args) ([]types.Plugin, error) {
	if err := pluginFilters.Validate(acceptedPluginFilterTags); err != nil {
		return nil, err
	}
	enabledOnly := false
	disabledOnly := false
	if pluginFilters.Contains("enabled") {
		enabledFilter, err := pluginFilters.GetBoolOrDefault("enabled", false)
		if err != nil {
			return nil, err
		}
		if enabledFilter {
			enabledOnly = true
		} else {
			// enabled=false means "only disabled plugins".
			disabledOnly = true
		}
	}
	plugins := pm.config.Store.GetAll()
	out := make([]types.Plugin, 0, len(plugins))
next:
	for _, p := range plugins {
		if enabledOnly && !p.PluginObj.Enabled {
			continue
		}
		if disabledOnly && p.PluginObj.Enabled {
			continue
		}
		if pluginFilters.Contains("capability") {
			// Drop the plugin if any of its declared capabilities does not
			// match the capability filter.
			for _, f := range p.GetTypes() {
				if !pluginFilters.Match("capability", f.Capability) {
					continue next
				}
			}
		}
		out = append(out, p.PluginObj)
	}
	return out, nil
}
// Push pushes a plugin to the registry.
//
// It builds (or reuses) the plugin's manifest, pushes its content via the
// containerd remotes package, and streams per-layer progress to outStream.
// When the https push fails it retries once with an http-fallback resolver.
func (pm *Manager) Push(ctx context.Context, name string, metaHeader http.Header, authConfig *registry.AuthConfig, outStream io.Writer) error {
	p, err := pm.config.Store.GetV2Plugin(name)
	if err != nil {
		return err
	}
	ref, err := reference.ParseNormalizedNamed(p.Name())
	if err != nil {
		return errors.Wrapf(err, "plugin has invalid name %v for push", p.Name())
	}
	// The status tracker feeds the progress-reporting goroutine below.
	statusTracker := docker.NewInMemoryTracker()
	resolver, err := pm.newResolver(ctx, statusTracker, authConfig, metaHeader, false)
	if err != nil {
		return err
	}
	pusher, err := resolver.Pusher(ctx, ref.String())
	if err != nil {
		return errors.Wrap(err, "error creating plugin pusher")
	}
	pj := newPushJobs(statusTracker)
	ctx, cancel := context.WithCancel(ctx)
	out, waitProgress := setupProgressOutput(outStream, cancel)
	defer waitProgress()
	// progressHandler registers each descriptor as a push job before the
	// wrapped handler processes it.
	progressHandler := c8dimages.HandlerFunc(func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
		log.G(ctx).WithField("mediaType", desc.MediaType).WithField("digest", desc.Digest.String()).Debug("Preparing to push plugin layer")
		id := stringid.TruncateID(desc.Digest.String())
		pj.add(remotes.MakeRefKey(ctx, desc), id)
		progress.Update(out, id, "Preparing")
		return nil, nil
	})
	desc, err := pm.getManifestDescriptor(ctx, p)
	if err != nil {
		return errors.Wrap(err, "error reading plugin manifest")
	}
	progress.Messagef(out, "", "The push refers to repository [%s]", reference.FamiliarName(ref))
	// TODO: If a layer already exists on the registry, the progress output just says "Preparing"
	// Poll the status tracker every 100ms and forward progress updates until
	// the context is cancelled (via waitProgress/cancel above).
	go func() {
		timer := time.NewTimer(100 * time.Millisecond)
		defer timer.Stop()
		// Stop and drain the timer so the first Reset in the loop starts clean.
		if !timer.Stop() {
			<-timer.C
		}
		var statuses []contentStatus
		for {
			timer.Reset(100 * time.Millisecond)
			select {
			case <-ctx.Done():
				return
			case <-timer.C:
				statuses = pj.status()
			}
			for _, s := range statuses {
				out.WriteProgress(progress.Progress{ID: s.Ref, Current: s.Offset, Total: s.Total, Action: s.Status, LastUpdate: s.Offset == s.Total})
			}
		}
	}()
	// Make sure we can authenticate the request since the auth scope for plugin repos is different than a normal repo.
	ctx = docker.WithScope(ctx, scope(ref, true))
	if err := remotes.PushContent(ctx, pusher, desc, pm.blobStore, nil, nil, func(h c8dimages.Handler) c8dimages.Handler {
		return c8dimages.Handlers(progressHandler, h)
	}); err != nil {
		// Try fallback to http.
		// This is needed because the containerd pusher will only attempt the first registry config we pass, which would
		// typically be https.
		// If there are no http-only host configs found we'll error out anyway.
		resolver, _ := pm.newResolver(ctx, statusTracker, authConfig, metaHeader, true)
		if resolver != nil {
			pusher, _ := resolver.Pusher(ctx, ref.String())
			if pusher != nil {
				// Fixed typo in the debug message ("Re-attmpting").
				log.G(ctx).WithField("ref", ref).Debug("Re-attempting push with http-fallback")
				err2 := remotes.PushContent(ctx, pusher, desc, pm.blobStore, nil, nil, func(h c8dimages.Handler) c8dimages.Handler {
					return c8dimages.Handlers(progressHandler, h)
				})
				if err2 == nil {
					err = nil
				} else {
					log.G(ctx).WithError(err2).WithField("ref", ref).Debug("Error while attempting push with http-fallback")
				}
			}
		}
		if err != nil {
			return errors.Wrap(err, "error pushing plugin")
		}
	}
	// For blobs that already exist in the registry we need to make sure to update the progress otherwise it will just say "pending"
	// TODO: How to check if the layer already exists? Is it worth it?
	for _, j := range pj.jobs {
		progress.Update(out, pj.names[j], "Upload complete")
	}
	// Signal the client for content trust verification
	progress.Aux(out, types.PushResult{Tag: ref.(reference.Tagged).Tag(), Digest: desc.Digest.String(), Size: int(desc.Size)})
	return nil
}
// manifest wraps an OCI manifest with an explicit mediaType field.
//
// Historically the registry does not support plugins unless the media type on
// the manifest is specifically schema2.MediaTypeManifest, so the OCI manifest
// media type is not supported. Additionally, there is extra validation for the
// docker schema2 manifest when a mediaType is set on the manifest itself (even
// though this is set on the descriptor); the OCI types do not have this field.
type manifest struct {
	ocispec.Manifest
	// MediaType mirrors the descriptor's media type inside the manifest body.
	MediaType string `json:"mediaType,omitempty"`
}
// buildManifest assembles a docker schema2 manifest for a plugin from its
// config digest and layer digests, reading blob sizes from the content store.
func buildManifest(ctx context.Context, s content.Manager, config digest.Digest, layers []digest.Digest) (manifest, error) {
	var m manifest
	m.MediaType = c8dimages.MediaTypeDockerSchema2Manifest
	m.SchemaVersion = 2
	configInfo, err := s.Info(ctx, config)
	if err != nil {
		return m, errors.Wrapf(err, "error reading plugin config content for digest %s", config)
	}
	m.Config = ocispec.Descriptor{
		// NOTE(review): uses the unexported mediaTypePluginConfig (declared
		// elsewhere in this package); presumably the same value as the
		// exported MediaTypePluginConfig — confirm and consolidate.
		MediaType: mediaTypePluginConfig,
		Size:      configInfo.Size,
		Digest:    configInfo.Digest,
	}
	for _, l := range layers {
		info, err := s.Info(ctx, l)
		if err != nil {
			return m, errors.Wrapf(err, "error fetching info for content digest %s", l)
		}
		m.Layers = append(m.Layers, ocispec.Descriptor{
			MediaType: c8dimages.MediaTypeDockerSchema2LayerGzip, // TODO: This is assuming everything is a gzip compressed layer, but that may not be true.
			Digest:    l,
			Size:      info.Size,
		})
	}
	return m, nil
}
// getManifestDescriptor gets the OCI descriptor for a manifest
// It will generate a manifest if one does not exist
func (pm *Manager) getManifestDescriptor(ctx context.Context, p *v2.Plugin) (ocispec.Descriptor, error) {
	logger := log.G(ctx).WithField("plugin", p.Name()).WithField("digest", p.Manifest)
	if p.Manifest != "" {
		// Fast path: the manifest blob already exists in the content store.
		info, err := pm.blobStore.Info(ctx, p.Manifest)
		if err == nil {
			desc := ocispec.Descriptor{
				Size:      info.Size,
				Digest:    info.Digest,
				MediaType: c8dimages.MediaTypeDockerSchema2Manifest,
			}
			return desc, nil
		}
		logger.WithError(err).Debug("Could not find plugin manifest in content store")
	} else {
		logger.Info("Plugin does not have manifest digest")
	}
	// Slow path: rebuild the manifest from the plugin's config and blobsums
	// and write it back to the content store.
	logger.Info("Building a new plugin manifest")
	mfst, err := buildManifest(ctx, pm.blobStore, p.Config, p.Blobsums)
	if err != nil {
		return ocispec.Descriptor{}, err
	}
	desc, err := writeManifest(ctx, pm.blobStore, &mfst)
	if err != nil {
		return desc, err
	}
	// Persist the plugin; a save failure is logged but not fatal.
	// NOTE(review): nothing here assigns p.Manifest before saving — confirm
	// whether save/upgradePlugin records the new digest elsewhere.
	if err := pm.save(p); err != nil {
		logger.WithError(err).Error("Could not save plugin with manifest digest")
	}
	return desc, nil
}
// writeManifest marshals m, stores it as a blob in cs, and returns a
// descriptor (schema2 media type, digest, size, default platform) that
// points at the stored manifest.
func writeManifest(ctx context.Context, cs content.Store, m *manifest) (ocispec.Descriptor, error) {
	platform := platforms.DefaultSpec()
	desc := ocispec.Descriptor{
		MediaType: c8dimages.MediaTypeDockerSchema2Manifest,
		Platform:  &platform,
	}
	data, err := json.Marshal(m)
	if err != nil {
		return desc, errors.Wrap(err, "error encoding manifest")
	}
	// The digest/size are computed from the exact serialized bytes written below.
	desc.Digest = digest.FromBytes(data)
	desc.Size = int64(len(data))
	if err := content.WriteBlob(ctx, cs, remotes.MakeRefKey(ctx, desc), bytes.NewReader(data), desc); err != nil {
		return desc, errors.Wrap(err, "error writing plugin manifest")
	}
	return desc, nil
}
// Remove deletes plugin's root directory.
//
// Unless ForceRemove is set, removal is refused while the plugin is in use
// or enabled. The plugin's directory is unmounted and removed atomically,
// and a GC run is scheduled to reclaim its blobs.
func (pm *Manager) Remove(name string, config *backend.PluginRmConfig) error {
	p, err := pm.config.Store.GetV2Plugin(name)
	// Check the lookup error before touching p; previously p (nil on error)
	// was used as a map key before the error was inspected.
	if err != nil {
		return err
	}
	pm.mu.RLock()
	c := pm.cMap[p]
	pm.mu.RUnlock()
	if !config.ForceRemove {
		if p.GetRefCount() > 0 {
			return inUseError(p.Name())
		}
		if p.IsEnabled() {
			return enabledError(p.Name())
		}
	}
	// Best-effort disable: removal proceeds even if disabling fails.
	if p.IsEnabled() {
		if err := pm.disable(p, c); err != nil {
			log.G(context.TODO()).Errorf("failed to disable plugin '%s': %s", p.Name(), err)
		}
	}
	// Schedule a GC run once removal has finished (success or failure).
	defer func() {
		go pm.GC()
	}()
	id := p.GetID()
	pluginDir := filepath.Join(pm.config.Root, id)
	// Unmount anything mounted below the plugin dir before deleting it.
	if err := mount.RecursiveUnmount(pluginDir); err != nil {
		return errors.Wrap(err, "error unmounting plugin data")
	}
	if err := atomicRemoveAll(pluginDir); err != nil {
		return err
	}
	pm.config.Store.Remove(p)
	pm.config.LogPluginEvent(id, name, events.ActionRemove)
	pm.publisher.Publish(EventRemove{Plugin: p.PluginObj})
	return nil
}
// Set sets plugin args
func (pm *Manager) Set(name string, args []string) error {
	p, err := pm.config.Store.GetV2Plugin(name)
	if err != nil {
		return err
	}
	// Apply the settings to the in-memory plugin first, then persist.
	if err := p.Set(args); err != nil {
		return err
	}
	return pm.save(p)
}
// CreateFromContext creates a plugin from the given pluginDir which contains
// both the rootfs and the config.json and a repoName with optional tag.
func (pm *Manager) CreateFromContext(ctx context.Context, tarCtx io.ReadCloser, options *types.PluginCreateOptions) (retErr error) {
	// Hold the GC read-lock so blobs written below are not collected mid-create.
	pm.muGC.RLock()
	defer pm.muGC.RUnlock()
	ref, err := reference.ParseNormalizedNamed(options.RepoName)
	if err != nil {
		return errors.Wrapf(err, "failed to parse reference %v", options.RepoName)
	}
	if _, ok := ref.(reference.Canonical); ok {
		return errors.Errorf("canonical references are not permitted")
	}
	name := reference.FamiliarString(reference.TagNameOnly(ref))
	if err := pm.config.Store.validateName(name); err != nil { // fast check, real check is in createPlugin()
		return err
	}
	tmpRootFSDir, err := os.MkdirTemp(pm.tmpDir(), ".rootfs")
	if err != nil {
		return errors.Wrap(err, "failed to create temp directory")
	}
	defer func() {
		if err := os.RemoveAll(tmpRootFSDir); err != nil {
			log.G(ctx).WithError(err).Warn("failed to remove temp rootfs directory")
		}
	}()
	var configJSON []byte
	// rootFS streams only the rootfs entries of the build context; the raw
	// config.json bytes are captured into configJSON as a side effect.
	rootFS := splitConfigRootFSFromTar(tarCtx, &configJSON)
	rootFSBlob, err := pm.blobStore.Writer(ctx, content.WithRef(name))
	if err != nil {
		return err
	}
	defer rootFSBlob.Close()
	// Tee the rootfs tar into a gzip-compressed blob while simultaneously
	// extracting it to tmpRootFSDir.
	gzw := gzip.NewWriter(rootFSBlob)
	rootFSReader := io.TeeReader(rootFS, gzw)
	if err := chrootarchive.Untar(rootFSReader, tmpRootFSDir, nil); err != nil {
		return err
	}
	if err := rootFS.Close(); err != nil {
		return err
	}
	if configJSON == nil {
		return errors.New("config not found")
	}
	// Flush the gzip stream before committing the blob below.
	if err := gzw.Close(); err != nil {
		return errors.Wrap(err, "error closing gzip writer")
	}
	var config types.PluginConfig
	if err := json.Unmarshal(configJSON, &config); err != nil {
		return errors.Wrap(err, "failed to parse config")
	}
	if err := pm.validateConfig(config); err != nil {
		return err
	}
	pm.mu.Lock()
	defer pm.mu.Unlock()
	if err := rootFSBlob.Commit(ctx, 0, ""); err != nil {
		return err
	}
	// On failure, schedule a GC run to clean up blobs committed above.
	defer func() {
		if retErr != nil {
			go pm.GC()
		}
	}()
	config.Rootfs = &types.PluginConfigRootfs{
		Type:    "layers",
		DiffIds: []string{rootFSBlob.Digest().String()},
	}
	config.DockerVersion = dockerversion.Version
	configBlob, err := pm.blobStore.Writer(ctx, content.WithRef(name+"-config.json"))
	if err != nil {
		return err
	}
	defer configBlob.Close()
	if err := json.NewEncoder(configBlob).Encode(config); err != nil {
		return errors.Wrap(err, "error encoding json config")
	}
	if err := configBlob.Commit(ctx, 0, ""); err != nil {
		return err
	}
	configDigest := configBlob.Digest()
	layers := []digest.Digest{rootFSBlob.Digest()}
	// Build and store the manifest referencing the config and rootfs layer.
	mfst, err := buildManifest(ctx, pm.blobStore, configDigest, layers)
	if err != nil {
		return err
	}
	desc, err := writeManifest(ctx, pm.blobStore, &mfst)
	if err != nil {
		return err
	}
	p, err := pm.createPlugin(name, configDigest, desc.Digest, layers, tmpRootFSDir, nil)
	if err != nil {
		return err
	}
	p.PluginObj.PluginReference = name
	pm.publisher.Publish(EventCreate{Plugin: p.PluginObj})
	pm.config.LogPluginEvent(p.PluginObj.ID, name, events.ActionCreate)
	return nil
}
// validateConfig checks a plugin config for validity.
//
// NOTE(review): currently a stub that accepts any config (see TODO).
func (pm *Manager) validateConfig(config types.PluginConfig) error {
	return nil // TODO:
}
// splitConfigRootFSFromTar consumes the plugin build-context tar from in and
// returns a reader producing a new tar containing only the entries under the
// rootfs directory, re-rooted at the top level. While streaming, the raw
// bytes of the config file are captured into *config.
//
// The returned reader is the read end of a pipe; the split runs in a
// goroutine, and errors (including "no rootfs found") are delivered to the
// reader via CloseWithError.
func splitConfigRootFSFromTar(in io.ReadCloser, config *[]byte) io.ReadCloser {
	pr, pw := io.Pipe()
	go func() {
		tarReader := tar.NewReader(in)
		tarWriter := tar.NewWriter(pw)
		defer in.Close()
		hasRootFS := false
		for {
			hdr, err := tarReader.Next()
			if err == io.EOF {
				if !hasRootFS {
					pw.CloseWithError(errors.Wrap(err, "no rootfs found"))
					return
				}
				// Signals end of archive.
				tarWriter.Close()
				pw.Close()
				return
			}
			if err != nil {
				pw.CloseWithError(errors.Wrap(err, "failed to read from tar"))
				return
			}
			content := io.Reader(tarReader)
			// Normalize the entry name; strip a leading slash so absolute
			// names are treated as relative.
			name := path.Clean(hdr.Name)
			if path.IsAbs(name) {
				name = name[1:]
			}
			if name == configFileName {
				dt, err := io.ReadAll(content)
				if err != nil {
					pw.CloseWithError(errors.Wrapf(err, "failed to read %s", configFileName))
					return
				}
				*config = dt
			}
			if parts := strings.Split(name, "/"); len(parts) != 0 && parts[0] == rootFSFileName {
				// Strip the leading rootfs directory component from the entry.
				hdr.Name = path.Clean(path.Join(parts[1:]...))
				// Hard links that point inside the rootfs must be rewritten
				// relative to the new root as well.
				if hdr.Typeflag == tar.TypeLink && strings.HasPrefix(strings.ToLower(hdr.Linkname), rootFSFileName+"/") {
					hdr.Linkname = hdr.Linkname[len(rootFSFileName)+1:]
				}
				if err := tarWriter.WriteHeader(hdr); err != nil {
					pw.CloseWithError(errors.Wrap(err, "error writing tar header"))
					return
				}
				if _, err := pools.Copy(tarWriter, content); err != nil {
					pw.CloseWithError(errors.Wrap(err, "error copying tar data"))
					return
				}
				hasRootFS = true
			} else {
				// Drain non-rootfs entries so the tar reader can advance.
				io.Copy(io.Discard, content)
			}
		}
	}()
	return pr
}
// atomicRemoveAll removes dir by first renaming it to dir+"-removing" and
// then deleting the renamed directory, so observers of dir see it either
// fully present or gone — never half-deleted. If the final removal fails,
// the directory is renamed back to its original name.
func atomicRemoveAll(dir string) error {
	renamed := dir + "-removing"
	err := os.Rename(dir, renamed)
	switch {
	case os.IsNotExist(err), err == nil:
		// even if `dir` doesn't exist, we can still try and remove `renamed`
	case os.IsExist(err):
		// Some previous remove failed, check if the origin dir exists
		// NOTE(review): os.Rename rarely reports EEXIST on Unix (rename(2)
		// replaces the target); confirm which platforms reach this branch.
		if e := containerfs.EnsureRemoveAll(renamed); e != nil {
			return errors.Wrap(err, "rename target already exists and could not be removed")
		}
		if _, err := os.Stat(dir); os.IsNotExist(err) {
			// origin doesn't exist, nothing left to do
			return nil
		}
		// attempt to rename again
		if err := os.Rename(dir, renamed); err != nil {
			return errors.Wrap(err, "failed to rename dir for atomic removal")
		}
	default:
		return errors.Wrap(err, "failed to rename dir for atomic removal")
	}
	if err := containerfs.EnsureRemoveAll(renamed); err != nil {
		// Removal failed: best-effort restore of the original name so a
		// later attempt can retry.
		os.Rename(renamed, dir)
		return err
	}
	return nil
}
package plugin
import (
"strings"
"sync"
v2 "github.com/docker/docker/daemon/pkg/plugin/v2"
"github.com/docker/docker/pkg/plugins"
"github.com/opencontainers/runtime-spec/specs-go"
)
// Store manages the plugin inventory in memory and on-disk
type Store struct {
sync.RWMutex
plugins map[string]*v2.Plugin
specOpts map[string][]SpecOpt
/* handlers are necessary for transition path of legacy plugins
* to the new model. Legacy plugins use Handle() for registering an
* activation callback.*/
handlers map[string][]func(string, *plugins.Client)
}
// NewStore creates a Store with all internal maps initialized.
func NewStore() *Store {
	s := &Store{}
	s.plugins = make(map[string]*v2.Plugin)
	s.specOpts = make(map[string][]SpecOpt)
	s.handlers = make(map[string][]func(string, *plugins.Client))
	return s
}
// SpecOpt is used for subsystems that need to modify the runtime spec of a plugin.
type SpecOpt func(*specs.Spec)

// CreateOpt is used to configure specific plugin details when created.
type CreateOpt func(p *v2.Plugin)
// WithSwarmService is a CreateOpt that flags the passed in plugin as a plugin
// managed by swarm by recording the owning swarm service ID on it.
func WithSwarmService(id string) CreateOpt {
	return func(p *v2.Plugin) {
		p.SwarmServiceID = id
	}
}
// WithEnv is a CreateOpt that passes the user-provided environment variables
// to the plugin container. Variables with the same name are de-duplicated
// (case-sensitively, user values winning over config defaults) and only
// well-formed key=value pairs are kept.
func WithEnv(env []string) CreateOpt {
	return func(p *v2.Plugin) {
		merged := map[string]string{}
		// Seed with the defaults declared in the plugin config (skip
		// entries that have no value).
		for _, configured := range p.PluginObj.Config.Env {
			if configured.Value == nil {
				continue
			}
			merged[configured.Name] = *configured.Value
		}
		// Overlay user-supplied values; malformed entries (no "=") are dropped.
		for _, kv := range env {
			k, v, ok := strings.Cut(kv, "=")
			if !ok {
				continue
			}
			merged[k] = v
		}
		out := make([]string, 0, len(merged))
		for k, v := range merged {
			out = append(out, k+"="+v)
		}
		p.PluginObj.Settings.Env = out
	}
}
// WithSpecMounts is a SpecOpt which appends the provided mounts to the runtime spec.
func WithSpecMounts(mounts []specs.Mount) SpecOpt {
	return func(s *specs.Spec) {
		for _, m := range mounts {
			s.Mounts = append(s.Mounts, m)
		}
	}
}
package plugin
import "fmt"
// errNotFound is returned when a plugin lookup fails; satisfies the
// errdefs "NotFound" interface.
type errNotFound string

func (n errNotFound) Error() string {
	return fmt.Sprintf("plugin %q not found", string(n))
}

func (errNotFound) NotFound() {}

// errAmbiguous is returned when a name resolves to more than one plugin;
// satisfies the errdefs "InvalidParameter" interface.
type errAmbiguous string

func (n errAmbiguous) Error() string {
	return fmt.Sprintf("multiple plugins found for %q", string(n))
}

func (errAmbiguous) InvalidParameter() {}

// errDisabled is returned when an operation requires an enabled plugin;
// satisfies the errdefs "Conflict" interface.
type errDisabled string

func (n errDisabled) Error() string {
	return fmt.Sprintf("plugin %s found but disabled", string(n))
}

func (errDisabled) Conflict() {}

// inUseError is returned when removing a plugin that still has references.
type inUseError string

func (e inUseError) Error() string {
	return fmt.Sprintf("plugin %s is in use", string(e))
}

func (inUseError) Conflict() {}

// enabledError is returned when an operation requires a disabled plugin.
type enabledError string

func (e enabledError) Error() string {
	return fmt.Sprintf("plugin %s is enabled", string(e))
}

func (enabledError) Conflict() {}

// alreadyExistsError is returned when creating a plugin with a taken name.
type alreadyExistsError string

func (e alreadyExistsError) Error() string {
	return fmt.Sprintf("plugin %s already exists", string(e))
}

func (alreadyExistsError) Conflict() {}
package plugin
import (
"fmt"
"reflect"
"github.com/moby/moby/api/types"
)
// Event is emitted for actions performed on the plugin manager.
type Event interface {
	// matches reports whether the observed event satisfies this
	// (subscriber-side) event filter.
	matches(Event) bool
}

// EventCreate is an event which is emitted when a plugin is created
// This is either by pull or create from context.
//
// Use the `Interfaces` field to match only plugins that implement a specific
// interface.
// These are matched against using "or" logic.
// If no interfaces are listed, all are matched.
type EventCreate struct {
	// Interfaces is the set of capability names to match against.
	Interfaces map[string]bool
	Plugin     types.Plugin
}
// matches reports whether observed is an EventCreate whose plugin implements
// at least one of e.Interfaces; an empty filter matches every create event.
func (e EventCreate) matches(observed Event) bool {
	other, ok := observed.(EventCreate)
	if !ok {
		return false
	}
	if len(e.Interfaces) == 0 {
		return true
	}
	for _, typ := range other.Plugin.Config.Interface.Types {
		if e.Interfaces[typ.Capability] {
			return true
		}
	}
	return false
}
// EventRemove is an event which is emitted when a plugin is removed.
// It matches on the passed in plugin's ID only.
type EventRemove struct {
	Plugin types.Plugin
}

// matches reports whether observed is a removal of the same plugin ID.
func (e EventRemove) matches(observed Event) bool {
	other, ok := observed.(EventRemove)
	return ok && e.Plugin.ID == other.Plugin.ID
}
// EventDisable is an event that is emitted when a plugin is disabled.
// It matches on the passed in plugin's ID only.
type EventDisable struct {
	Plugin types.Plugin
}

// matches reports whether observed is a disable of the same plugin ID.
func (e EventDisable) matches(observed Event) bool {
	other, ok := observed.(EventDisable)
	return ok && e.Plugin.ID == other.Plugin.ID
}
// EventEnable is an event that is emitted when a plugin is enabled.
// It matches on the passed in plugin's ID only.
type EventEnable struct {
	Plugin types.Plugin
}

// matches reports whether observed is an enable of the same plugin ID.
func (e EventEnable) matches(observed Event) bool {
	other, ok := observed.(EventEnable)
	return ok && e.Plugin.ID == other.Plugin.ID
}
// SubscribeEvents provides an event channel to listen for structured events from
// the plugin manager actions, CRUD operations.
// The caller must call the returned `cancel()` function once done with the channel
// or this will leak resources.
func (pm *Manager) SubscribeEvents(buffer int, watchEvents ...Event) (eventCh <-chan interface{}, cancel func()) {
	// topic decides whether a published item is delivered to this subscriber.
	topic := func(i interface{}) bool {
		observed, ok := i.(Event)
		if !ok {
			// Everything published on this publisher must be an Event;
			// anything else is a programming error.
			panic(fmt.Sprintf("unexpected type passed to event channel: %v", reflect.TypeOf(i)))
		}
		for _, e := range watchEvents {
			if e.matches(observed) {
				return true
			}
		}
		// If no specific events are specified always assume a matched event
		// If some events were specified and none matched above, then the event
		// doesn't match
		// NOTE(review): this is a nil check, not len()==0 — a caller passing
		// an explicitly empty non-nil slice would match nothing; confirm intended.
		return watchEvents == nil
	}
	ch := pm.publisher.SubscribeTopicWithBuffer(topic, buffer)
	cancelFunc := func() { pm.publisher.Evict(ch) }
	return ch, cancelFunc
}
package plugin
import (
"context"
"io"
"net/http"
"time"
"github.com/containerd/containerd/v2/core/content"
c8dimages "github.com/containerd/containerd/v2/core/images"
"github.com/containerd/containerd/v2/core/remotes"
"github.com/containerd/containerd/v2/core/remotes/docker"
cerrdefs "github.com/containerd/errdefs"
"github.com/containerd/log"
"github.com/distribution/reference"
progressutils "github.com/docker/docker/distribution/utils"
"github.com/docker/docker/pkg/ioutils"
"github.com/docker/docker/pkg/progress"
"github.com/docker/docker/pkg/stringid"
"github.com/moby/go-archive/chrootarchive"
"github.com/moby/moby/api/types/registry"
"github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// mediaTypePluginConfig is the media type of the plugin config blob in a
// plugin manifest.
const mediaTypePluginConfig = "application/vnd.docker.plugin.v1+json"

// setupProgressOutput sets up the passed in writer to stream progress.
//
// The passed in cancel function is used by the progress writer to signal callers that there
// is an issue writing to the stream.
//
// The returned function is used to wait for the progress writer to be finished.
// Call it to make sure the progress writer is done before returning from your function as needed.
func setupProgressOutput(outStream io.Writer, cancel func()) (progress.Output, func()) {
	var out progress.Output
	f := func() {}
	if outStream != nil {
		ch := make(chan progress.Progress, 100)
		out = progress.ChanOutput(ch)
		// ctx is used purely as a done-signal: retCancel is called when the
		// writer goroutine below has drained ch and returned.
		ctx, retCancel := context.WithCancel(context.Background())
		go func() {
			progressutils.WriteDistributionProgress(cancel, outStream, ch)
			retCancel()
		}()
		f = func() {
			// Closing ch lets the writer goroutine finish; waiting on
			// ctx.Done() ensures all progress has been flushed before return.
			close(ch)
			<-ctx.Done()
		}
	} else {
		// No output stream requested: discard all progress updates.
		out = progress.DiscardOutput()
	}
	return out, f
}
// fetch the content related to the passed in reference into the blob store and appends the provided c8dimages.Handlers
// There is no need to use remotes.FetchHandler since it already gets set
func (pm *Manager) fetch(ctx context.Context, ref reference.Named, auth *registry.AuthConfig, out progress.Output, metaHeader http.Header, handlers ...c8dimages.Handler) error {
	// We need to make sure we have a domain on the reference
	withDomain, err := reference.ParseNormalizedNamed(ref.String())
	if err != nil {
		return errors.Wrap(err, "error parsing plugin image reference")
	}

	// Make sure we can authenticate the request since the auth scope for plugin repos is different than a normal repo.
	ctx = docker.WithScope(ctx, scope(ref, false))

	// Make sure the fetch handler knows how to set a ref key for the plugin media type.
	// Without this the ref key is "unknown" and we see a nasty warning message in the logs
	ctx = remotes.WithMediaTypeKeyPrefix(ctx, mediaTypePluginConfig, "docker-plugin")

	resolver, err := pm.newResolver(ctx, nil, auth, metaHeader, false)
	if err != nil {
		return err
	}
	resolved, desc, err := resolver.Resolve(ctx, withDomain.String())
	if err != nil {
		// This is backwards compatible with older versions of the distribution registry.
		// The containerd client will add it's own accept header as a comma separated list of supported manifests.
		// This is perfectly fine, unless you are talking to an older registry which does not split the comma separated list,
		// so it is never able to match a media type and it falls back to schema1 (yuck) and fails because our manifest the
		// fallback does not support plugin configs...
		log.G(ctx).WithError(err).WithField("ref", withDomain).Debug("Error while resolving reference, falling back to backwards compatible accept header format")
		// Retry with one Accept header per media type instead of a single
		// comma-separated header.
		headers := http.Header{}
		headers.Add("Accept", c8dimages.MediaTypeDockerSchema2Manifest)
		headers.Add("Accept", c8dimages.MediaTypeDockerSchema2ManifestList)
		headers.Add("Accept", ocispec.MediaTypeImageManifest)
		headers.Add("Accept", ocispec.MediaTypeImageIndex)
		// NOTE(review): the error from newResolver is intentionally dropped
		// here; on failure resolver stays nil and the original resolve error
		// is returned below.
		resolver, _ = pm.newResolver(ctx, nil, auth, headers, false)
		if resolver != nil {
			resolved, desc, err = resolver.Resolve(ctx, withDomain.String())
			if err != nil {
				log.G(ctx).WithError(err).WithField("ref", withDomain).Debug("Failed to resolve reference after falling back to backwards compatible accept header format")
			}
		}
		if err != nil {
			return errors.Wrap(err, "error resolving plugin reference")
		}
	}

	fetcher, err := resolver.Fetcher(ctx, resolved)
	if err != nil {
		return errors.Wrap(err, "error creating plugin image fetcher")
	}

	// Progress reporting runs first, then the actual fetch, then any
	// caller-provided handlers.
	fp := withFetchProgress(pm.blobStore, out, ref)
	handlers = append([]c8dimages.Handler{fp, remotes.FetchHandler(pm.blobStore, fetcher)}, handlers...)
	return c8dimages.Dispatch(ctx, c8dimages.Handlers(handlers...), nil, desc)
}
// applyLayer makes an c8dimages.HandlerFunc which applies a fetched image rootfs layer to a directory.
//
// TODO(@cpuguy83) This gets run sequentially after layer pull (makes sense), however
// if there are multiple layers to fetch we may end up extracting layers in the wrong
// order.
func applyLayer(cs content.Store, dir string, out progress.Output) c8dimages.HandlerFunc {
	return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
		// Only rootfs layer blobs are applied; any other media type is a no-op.
		switch desc.MediaType {
		case
			ocispec.MediaTypeImageLayer,
			c8dimages.MediaTypeDockerSchema2Layer,
			ocispec.MediaTypeImageLayerGzip,
			c8dimages.MediaTypeDockerSchema2LayerGzip:
		default:
			return nil, nil
		}

		ra, err := cs.ReaderAt(ctx, desc)
		if err != nil {
			return nil, errors.Wrapf(err, "error getting content from content store for digest %s", desc.Digest)
		}

		id := stringid.TruncateID(desc.Digest.String())

		// Wrap the content reader so extraction progress is reported as it is read.
		rc := ioutils.NewReadCloserWrapper(content.NewReader(ra), ra.Close)
		pr := progress.NewProgressReader(rc, out, desc.Size, id, "Extracting")
		defer pr.Close()

		if _, err := chrootarchive.ApplyLayer(dir, pr); err != nil {
			return nil, errors.Wrapf(err, "error applying layer for digest %s", desc.Digest)
		}
		progress.Update(out, id, "Complete")
		return nil, nil
	}
}
// childrenHandler wraps c8dimages.ChildrenHandler so that plugin config blobs
// are treated as leaves (they have no children to descend into).
func childrenHandler(cs content.Store) c8dimages.HandlerFunc {
	next := c8dimages.ChildrenHandler(cs)
	return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
		if desc.MediaType == mediaTypePluginConfig {
			return nil, nil
		}
		return next(ctx, desc)
	}
}
// fetchMeta accumulates the digests seen while fetching a plugin image.
type fetchMeta struct {
	blobs    []digest.Digest
	config   digest.Digest
	manifest digest.Digest
}

// storeFetchMetadata returns a handler that records layer, manifest, and
// plugin-config digests into m as descriptors are dispatched.
func storeFetchMetadata(m *fetchMeta) c8dimages.HandlerFunc {
	return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
		switch desc.MediaType {
		case mediaTypePluginConfig:
			m.config = desc.Digest
		case ocispec.MediaTypeImageManifest, c8dimages.MediaTypeDockerSchema2Manifest:
			m.manifest = desc.Digest
		case c8dimages.MediaTypeDockerSchema2LayerForeignGzip,
			c8dimages.MediaTypeDockerSchema2Layer,
			ocispec.MediaTypeImageLayer,
			ocispec.MediaTypeImageLayerGzip:
			m.blobs = append(m.blobs, desc.Digest)
		}
		return nil, nil
	}
}
// validateFetchedMetadata checks that a completed fetch produced both a
// plugin config digest and a manifest digest.
func validateFetchedMetadata(md fetchMeta) error {
	switch {
	case md.config == "":
		return errors.New("fetched plugin image but plugin config is missing")
	case md.manifest == "":
		return errors.New("fetched plugin image but manifest is missing")
	default:
		return nil
	}
}
// withFetchProgress is a fetch handler which registers a descriptor with a progress
func withFetchProgress(cs content.Store, out progress.Output, ref reference.Named) c8dimages.HandlerFunc {
	return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) {
		switch desc.MediaType {
		case ocispec.MediaTypeImageManifest, c8dimages.MediaTypeDockerSchema2Manifest:
			// Manifest: emit the "Pulling from ..." banner once, then stop.
			tn := reference.TagNameOnly(ref)
			var tagOrDigest string
			if tagged, ok := tn.(reference.Tagged); ok {
				tagOrDigest = tagged.Tag()
			} else {
				tagOrDigest = tn.String()
			}
			progress.Messagef(out, tagOrDigest, "Pulling from %s", reference.FamiliarName(ref))
			progress.Messagef(out, "", "Digest: %s", desc.Digest.String())
			return nil, nil
		case
			c8dimages.MediaTypeDockerSchema2LayerGzip,
			c8dimages.MediaTypeDockerSchema2Layer,
			ocispec.MediaTypeImageLayer,
			ocispec.MediaTypeImageLayerGzip:
			// Layers fall through to the download-progress tracking below.
		default:
			return nil, nil
		}

		id := stringid.TruncateID(desc.Digest.String())
		// Already in the content store: nothing to download.
		if _, err := cs.Info(ctx, desc.Digest); err == nil {
			out.WriteProgress(progress.Progress{ID: id, Action: "Already exists", LastUpdate: true})
			return nil, nil
		}
		progress.Update(out, id, "Waiting")

		key := remotes.MakeRefKey(ctx, desc)
		// Poll the content store's in-flight status every 100ms and translate
		// it into progress updates until the download completes or fails.
		go func() {
			// Reuse one timer across iterations; it is created stopped and
			// drained so the first Reset below behaves like a fresh timer.
			timer := time.NewTimer(100 * time.Millisecond)
			if !timer.Stop() {
				<-timer.C
			}
			defer timer.Stop()

			var pulling bool
			var (
				// make sure we can still fetch from the content store
				// if the main context is cancelled
				// TODO: Might need to add some sort of timeout; see https://github.com/moby/moby/issues/49413
				ctxErr      error
				noCancelCTX = context.WithoutCancel(ctx)
			)

			for {
				timer.Reset(100 * time.Millisecond)

				select {
				case <-ctx.Done():
					// Remember the cancellation but keep polling once more —
					// the fetch may still land in the store.
					ctxErr = ctx.Err()
				case <-timer.C:
				}

				s, err := cs.Status(noCancelCTX, key)
				if err != nil {
					if !cerrdefs.IsNotFound(err) {
						log.G(noCancelCTX).WithError(err).WithField("layerDigest", desc.Digest.String()).Error("Error looking up status of plugin layer pull")
						progress.Update(out, id, err.Error())
						return
					}
					// No active transfer: either it finished (content exists)
					// or it has not started yet.
					if _, err := cs.Info(noCancelCTX, desc.Digest); err == nil {
						progress.Update(out, id, "Download complete")
						return
					}
					if ctxErr != nil {
						progress.Update(out, id, ctxErr.Error())
						return
					}
					continue
				}

				if !pulling {
					progress.Update(out, id, "Pulling fs layer")
					pulling = true
				}

				if s.Offset == s.Total {
					out.WriteProgress(progress.Progress{ID: id, Action: "Download complete", Current: s.Offset, LastUpdate: true})
					return
				}
				out.WriteProgress(progress.Progress{ID: id, Action: "Downloading", Current: s.Offset, Total: s.Total})
			}
		}()
		return nil, nil
	}
}
package plugin
import (
"context"
"encoding/json"
"io"
"os"
"path/filepath"
"reflect"
"sort"
"strings"
"sync"
"syscall"
"github.com/containerd/containerd/v2/core/content"
"github.com/containerd/containerd/v2/plugins/content/local"
"github.com/containerd/log"
v2 "github.com/docker/docker/daemon/pkg/plugin/v2"
"github.com/docker/docker/internal/containerfs"
"github.com/docker/docker/internal/lazyregexp"
"github.com/docker/docker/pkg/authorization"
"github.com/docker/docker/registry"
"github.com/moby/moby/api/types"
"github.com/moby/moby/api/types/events"
"github.com/moby/pubsub"
"github.com/moby/sys/atomicwriter"
"github.com/opencontainers/go-digest"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
const (
	// configFileName is the plugin config file stored inside each plugin's
	// state directory.
	configFileName = "config.json"
	// rootFSFileName is the directory name of a plugin's root filesystem.
	rootFSFileName = "rootfs"
)

// validFullID matches a full 64-character lowercase hex plugin ID.
var validFullID = lazyregexp.New(`^([a-f0-9]{64})$`)
// Executor is the interface that the plugin manager uses to interact with for starting/stopping plugins.
type Executor interface {
	// Create starts a new plugin task with the given OCI spec and log streams.
	Create(id string, spec specs.Spec, stdout, stderr io.WriteCloser) error
	// IsRunning reports whether the plugin task is currently alive.
	IsRunning(id string) (bool, error)
	// Restore reattaches to an existing plugin task after a daemon restart.
	Restore(id string, stdout, stderr io.WriteCloser) (alive bool, err error)
	// Signal delivers signal to the plugin task.
	Signal(id string, signal syscall.Signal) error
}

// EndpointResolver provides looking up registry endpoints for pulling.
type EndpointResolver interface {
	LookupPullEndpoints(hostname string) (endpoints []registry.APIEndpoint, err error)
}
// restorePlugin re-attaches to p via the executor, but only when the plugin
// is marked enabled; disabled plugins need no restoration.
func (pm *Manager) restorePlugin(p *v2.Plugin, c *controller) error {
	if !p.IsEnabled() {
		return nil
	}
	return pm.restore(p, c)
}
// eventLogger is the callback used to emit daemon events for plugin actions.
type eventLogger func(id, name string, action events.Action)

// ManagerConfig defines configuration needed to start new manager.
type ManagerConfig struct {
	Store              *Store // remove
	RegistryService    EndpointResolver
	LiveRestoreEnabled bool // TODO: remove
	LogPluginEvent     eventLogger
	// Root is the directory holding plugin state (configs, rootfs, blobs).
	Root string
	// ExecRoot is the runtime directory (sockets, bundle dirs).
	ExecRoot        string
	CreateExecutor  ExecutorCreator
	AuthzMiddleware *authorization.Middleware
}

// ExecutorCreator is used in the manager config to pass in an `Executor`.
type ExecutorCreator func(*Manager) (Executor, error)
// Manager controls the plugin subsystem.
type Manager struct {
	config ManagerConfig
	mu     sync.RWMutex // protects cMap
	muGC   sync.RWMutex // protects blobstore deletions
	cMap   map[*v2.Plugin]*controller
	// blobStore holds fetched plugin content (configs and layers).
	blobStore content.Store
	// publisher fans out plugin lifecycle events to subscribers.
	publisher *pubsub.Publisher
	executor  Executor
}

// controller represents the manager's control on a plugin.
type controller struct {
	// restart requests an automatic re-enable when the plugin exits.
	restart bool
	// exitChan is closed when the plugin's exit event is handled.
	exitChan chan bool
	// timeoutInSecs is the client timeout applied to the plugin.
	timeoutInSecs int
}
// NewManager returns a new plugin manager. It creates the state directories,
// the executor and the blob store, then reloads any plugins found on disk.
func NewManager(config ManagerConfig) (*Manager, error) {
	m := &Manager{config: config}

	for _, d := range []string{m.config.Root, m.config.ExecRoot, m.tmpDir()} {
		if err := os.MkdirAll(d, 0o700); err != nil {
			return nil, errors.Wrapf(err, "failed to mkdir %v", d)
		}
	}

	executor, err := config.CreateExecutor(m)
	if err != nil {
		return nil, err
	}
	m.executor = executor

	store, err := local.NewStore(filepath.Join(m.config.Root, "storage"))
	if err != nil {
		return nil, errors.Wrap(err, "error creating plugin blob store")
	}
	m.blobStore = store

	m.cMap = make(map[*v2.Plugin]*controller)
	if err := m.reload(); err != nil {
		return nil, errors.Wrap(err, "failed to restore plugins")
	}

	m.publisher = pubsub.NewPublisher(0, 0)
	return m, nil
}
// tmpDir returns the manager's scratch directory under the plugin root.
func (pm *Manager) tmpDir() string {
	return filepath.Join(pm.config.Root, "tmp")
}
// HandleExitEvent is called when the executor receives the exit event
// In the future we may change this, but for now all we care about is the exit event.
func (pm *Manager) HandleExitEvent(id string) error {
	p, err := pm.config.Store.GetV2Plugin(id)
	if err != nil {
		return err
	}

	// Best-effort removal of the runtime bundle dir; failure is only logged.
	if err := os.RemoveAll(filepath.Join(pm.config.ExecRoot, id)); err != nil {
		log.G(context.TODO()).WithError(err).WithField("id", id).Error("Could not remove plugin bundle dir")
	}

	pm.mu.RLock()
	// NOTE(review): a plugin missing from cMap yields a nil controller and
	// the next line panics — confirm registration is guaranteed before exit
	// events can arrive. Also note exitChan is mutated while holding only
	// the read lock — verify this is safe against concurrent handlers.
	c := pm.cMap[p]
	if c.exitChan != nil {
		close(c.exitChan)
		c.exitChan = nil // ignore duplicate events (containerd issue #2299)
	}
	restart := c.restart
	pm.mu.RUnlock()

	if restart {
		pm.enable(p, c, true)
	} else if err := recursiveUnmount(filepath.Join(pm.config.Root, id)); err != nil {
		return errors.Wrap(err, "error cleaning up plugin mounts")
	}
	return nil
}
// handleLoadError logs a plugin-load failure for id; a missing config file is
// downgraded to a warning since it indicates a previously failed removal.
func handleLoadError(err error, id string) {
	if err == nil {
		return
	}
	entry := log.G(context.TODO()).WithError(err).WithField("id", id)
	if !errors.Is(err, os.ErrNotExist) {
		entry.Error("error loading plugin, skipping")
		return
	}
	// Likely some error while removing on an older version of docker
	entry.Warn("missing plugin config, skipping: this may be caused due to a failed remove and requires manual cleanup.")
}
// reload scans pm.config.Root for plugin state directories, loads each plugin
// into the store, and restores/enables them concurrently. Directories named
// "<id>-removing" are leftovers from failed removals and are cleaned up.
func (pm *Manager) reload() error { // todo: restore
	dir, err := os.ReadDir(pm.config.Root)
	if err != nil {
		return errors.Wrapf(err, "failed to read %v", pm.config.Root)
	}
	plugins := make(map[string]*v2.Plugin)
	for _, v := range dir {
		if validFullID.MatchString(v.Name()) {
			p, err := pm.loadPlugin(v.Name())
			if err != nil {
				handleLoadError(err, v.Name())
				continue
			}
			plugins[p.GetID()] = p
		} else {
			if validFullID.MatchString(strings.TrimSuffix(v.Name(), "-removing")) {
				// There was likely some error while removing this plugin, let's try to remove again here
				// NOTE(review): v.Name() is a bare directory name, so this
				// removes relative to the daemon's CWD rather than
				// pm.config.Root — looks like it should be
				// filepath.Join(pm.config.Root, v.Name()); confirm.
				if err := containerfs.EnsureRemoveAll(v.Name()); err != nil {
					log.G(context.TODO()).WithError(err).WithField("id", v.Name()).Warn("error while attempting to clean up previously removed plugin")
				}
			}
		}
	}
	pm.config.Store.SetAll(plugins)

	var wg sync.WaitGroup
	wg.Add(len(plugins))
	for _, p := range plugins {
		c := &controller{exitChan: make(chan bool)}
		pm.mu.Lock()
		pm.cMap[p] = c
		pm.mu.Unlock()
		// Restore each plugin in its own goroutine; wg gates return until
		// all have finished.
		go func(p *v2.Plugin) {
			defer wg.Done()
			// TODO(thaJeztah): make this fail if the plugin has "graphdriver" capability ?
			if err := pm.restorePlugin(p, c); err != nil {
				log.G(context.TODO()).WithError(err).WithField("id", p.GetID()).Error("Failed to restore plugin")
				return
			}

			// Recompute the rootfs path under the current plugin root.
			if p.Rootfs != "" {
				p.Rootfs = filepath.Join(pm.config.Root, p.PluginObj.ID, "rootfs")
			}

			// We should only enable rootfs propagation for certain plugin types that need it.
			for _, typ := range p.PluginObj.Config.Interface.Types {
				if (typ.Capability == "volumedriver" || typ.Capability == "graphdriver" || typ.Capability == "csinode" || typ.Capability == "csicontroller") && typ.Prefix == "docker" && strings.HasPrefix(typ.Version, "1.") {
					if p.PluginObj.Config.PropagatedMount != "" {
						propRoot := filepath.Join(filepath.Dir(p.Rootfs), "propagated-mount")
						if typ.Capability == "graphdriver" {
							// TODO(thaJeztah): remove this for next release.
							// NOTE(review): `err` here is the enclosing
							// function's (nil at this point) — WithError(err)
							// logs no error; confirm which error was intended.
							log.G(context.TODO()).WithError(err).WithField("dir", propRoot).Warn("skipping migrating propagated mount storage for deprecated graphdriver plugin")
						}

						// check if we need to migrate an older propagated mount from before
						// these mounts were stored outside the plugin rootfs
						if _, err := os.Stat(propRoot); os.IsNotExist(err) {
							rootfsProp := filepath.Join(p.Rootfs, p.PluginObj.Config.PropagatedMount)
							if _, err := os.Stat(rootfsProp); err == nil {
								if err := os.Rename(rootfsProp, propRoot); err != nil {
									log.G(context.TODO()).WithError(err).WithField("dir", propRoot).Error("error migrating propagated mount storage")
								}
							}
						}

						if err := os.MkdirAll(propRoot, 0o755); err != nil {
							log.G(context.TODO()).Errorf("failed to create PropagatedMount directory at %s: %v", propRoot, err)
						}
					}
				}
			}

			pm.save(p)
			requiresManualRestore := !pm.config.LiveRestoreEnabled && p.IsEnabled()
			if requiresManualRestore {
				// if liveRestore is not enabled, the plugin will be stopped now so we should enable it
				if err := pm.enable(p, c, true); err != nil {
					log.G(context.TODO()).WithError(err).WithField("id", p.GetID()).Error("failed to enable plugin")
				}
			}
		}(p)
	}
	wg.Wait()
	return nil
}
// Get looks up the requested plugin in the store by ID or name.
func (pm *Manager) Get(idOrName string) (*v2.Plugin, error) {
	return pm.config.Store.GetV2Plugin(idOrName)
}
// loadPlugin reads and decodes the on-disk config.json for plugin id from
// its state directory under the plugin root.
func (pm *Manager) loadPlugin(id string) (*v2.Plugin, error) {
	configPath := filepath.Join(pm.config.Root, id, configFileName)
	raw, err := os.ReadFile(configPath)
	if err != nil {
		return nil, errors.Wrapf(err, "error reading %v", configPath)
	}
	plugin := new(v2.Plugin)
	if err := json.Unmarshal(raw, plugin); err != nil {
		return nil, errors.Wrapf(err, "error decoding %v", configPath)
	}
	return plugin, nil
}
// save atomically persists the plugin's config.json into its state directory.
func (pm *Manager) save(p *v2.Plugin) error {
	data, err := json.Marshal(p)
	if err != nil {
		return errors.Wrap(err, "failed to marshal plugin json")
	}
	target := filepath.Join(pm.config.Root, p.GetID(), configFileName)
	if err := atomicwriter.WriteFile(target, data, 0o600); err != nil {
		return errors.Wrap(err, "failed to write atomically plugin json")
	}
	return nil
}
// GC cleans up unreferenced blobs. This is recommended to run in a goroutine
func (pm *Manager) GC() {
	pm.muGC.Lock()
	defer pm.muGC.Unlock()

	// Collect every digest still referenced by an installed plugin.
	inUse := map[digest.Digest]struct{}{}
	for _, p := range pm.config.Store.GetAll() {
		inUse[p.Config] = struct{}{}
		for _, b := range p.Blobsums {
			inUse[b] = struct{}{}
		}
	}

	// Delete everything else from the blob store.
	ctx := context.TODO()
	pm.blobStore.Walk(ctx, func(info content.Info) error {
		if _, keep := inUse[info.Digest]; keep {
			return nil
		}
		return pm.blobStore.Delete(ctx, info.Digest)
	})
}
// logHook is a logrus hook that tags every entry with the owning plugin ID.
type logHook struct{ id string }

// Levels registers the hook for all log levels.
func (logHook) Levels() []log.Level {
	return []log.Level{
		log.PanicLevel, log.FatalLevel, log.ErrorLevel, log.WarnLevel,
		log.InfoLevel, log.DebugLevel, log.TraceLevel,
	}
}

// Fire replaces the entry's fields with the plugin ID.
func (l logHook) Fire(entry *log.Entry) error {
	entry.Data = log.Fields{"plugin": l.id}
	return nil
}
// makeLoggerStreams builds a pair of write-closers that feed the plugin's
// stdout/stderr into the daemon log, tagged with the plugin ID.
func makeLoggerStreams(id string) (stdout, stderr io.WriteCloser) {
	l := logrus.New()
	l.Hooks.Add(logHook{id})
	stdout = l.WriterLevel(log.InfoLevel)
	stderr = l.WriterLevel(log.ErrorLevel)
	return stdout, stderr
}
// validatePrivileges errors unless the granted privileges exactly match the
// set required by the plugin config.
func validatePrivileges(requiredPrivileges, privileges types.PluginPrivileges) error {
	if isEqual(requiredPrivileges, privileges, isEqualPrivilege) {
		return nil
	}
	return errors.New("incorrect privileges")
}
// isEqual reports whether the two privilege sets contain pairwise-equal
// elements (per compare) after sorting, regardless of input order.
// Note: sort.Sort mutates both argument slices in place.
func isEqual(arrOne, arrOther types.PluginPrivileges, compare func(x, y types.PluginPrivilege) bool) bool {
	if len(arrOne) != len(arrOther) {
		return false
	}

	sort.Sort(arrOne)
	sort.Sort(arrOther)

	// BUG FIX: the loop previously started at i := 1, so the first pair was
	// never compared and sets differing only in their first (sorted) element
	// were wrongly reported as equal.
	for i := 0; i < arrOne.Len(); i++ {
		if !compare(arrOne[i], arrOther[i]) {
			return false
		}
	}
	return true
}
// isEqualPrivilege reports whether two privileges have the same name and a
// deeply-equal value.
func isEqualPrivilege(a, b types.PluginPrivilege) bool {
	return a.Name == b.Name && reflect.DeepEqual(a.Value, b.Value)
}
package plugin
import (
"context"
"encoding/json"
"net"
"os"
"path/filepath"
"time"
"github.com/containerd/containerd/v2/core/content"
"github.com/containerd/log"
"github.com/docker/docker/daemon/initlayer"
v2 "github.com/docker/docker/daemon/pkg/plugin/v2"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/pkg/plugins"
"github.com/docker/docker/pkg/stringid"
"github.com/moby/moby/api/types"
"github.com/moby/sys/mount"
"github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
// enable makes the plugin runnable: it rebuilds the rootfs init layer, sets
// up the propagated mount (when configured), creates the plugin task via the
// executor, and finishes through pluginPostStart. With force set, an
// already-enabled plugin is started again instead of being rejected.
func (pm *Manager) enable(p *v2.Plugin, c *controller, force bool) error {
	p.Rootfs = filepath.Join(pm.config.Root, p.PluginObj.ID, "rootfs")
	if p.IsEnabled() && !force {
		return errors.Wrap(enabledError(p.Name()), "plugin already enabled")
	}
	spec, err := p.InitSpec(pm.config.ExecRoot)
	if err != nil {
		return err
	}

	c.restart = true
	c.exitChan = make(chan bool)
	// Register the controller before starting so exit events can find it.
	pm.mu.Lock()
	pm.cMap[p] = c
	pm.mu.Unlock()

	var propRoot string
	if p.PluginObj.Config.PropagatedMount != "" {
		propRoot = filepath.Join(filepath.Dir(p.Rootfs), "propagated-mount")
		// MkdirAll failure is only logged; MakeRShared below will fail hard
		// if the directory is truly unusable.
		if err := os.MkdirAll(propRoot, 0o755); err != nil {
			log.G(context.TODO()).Errorf("failed to create PropagatedMount directory at %s: %v", propRoot, err)
		}
		if err := mount.MakeRShared(propRoot); err != nil {
			return errors.Wrap(err, "error setting up propagated mount dir")
		}
	}

	rootFS := filepath.Join(pm.config.Root, p.PluginObj.ID, rootFSFileName)
	if err := initlayer.Setup(rootFS, 0, 0); err != nil {
		return errors.WithStack(err)
	}

	stdout, stderr := makeLoggerStreams(p.GetID())
	if err := pm.executor.Create(p.GetID(), *spec, stdout, stderr); err != nil {
		// Task creation failed: undo the propagated-mount setup best-effort.
		if p.PluginObj.Config.PropagatedMount != "" {
			if err := mount.Unmount(propRoot); err != nil {
				log.G(context.TODO()).WithField("plugin", p.Name()).WithError(err).Warn("Failed to unmount vplugin propagated mount root")
			}
		}
		return errors.WithStack(err)
	}
	return pm.pluginPostStart(p, c)
}
// pluginPostStart finishes plugin startup: it wires up the plugin's unix
// socket address, optionally creates a legacy v1 HTTP client, waits for the
// plugin to start listening (with retries), then marks the plugin enabled,
// invokes registered handlers, and persists the state. On failure the plugin
// is shut down and (for the dial path) marked disabled.
func (pm *Manager) pluginPostStart(p *v2.Plugin, c *controller) error {
	sockAddr := filepath.Join(pm.config.ExecRoot, p.GetID(), p.GetSocket())
	p.SetTimeout(time.Duration(c.timeoutInSecs) * time.Second)
	addr := &net.UnixAddr{Net: "unix", Name: sockAddr}
	p.SetAddr(addr)

	if p.Protocol() == plugins.ProtocolSchemeHTTPV1 {
		client, err := plugins.NewClientWithTimeout(addr.Network()+"://"+addr.String(), nil, p.Timeout())
		if err != nil {
			c.restart = false
			shutdownPlugin(p, c.exitChan, pm.executor)
			return errors.WithStack(err)
		}
		p.SetPClient(client) //nolint:staticcheck // FIXME(thaJeztah): p.SetPClient is deprecated: Hardcoded plugin client is deprecated
	}

	// Initial sleep before net Dial to allow plugin to listen on socket.
	time.Sleep(500 * time.Millisecond)
	maxRetries := 3
	var retries int
	for {
		// net dial into the unix socket to see if someone's listening.
		conn, err := net.Dial("unix", sockAddr)
		if err == nil {
			conn.Close()
			break
		}

		time.Sleep(3 * time.Second)
		retries++

		if retries > maxRetries {
			log.G(context.TODO()).Debugf("error net dialing plugin: %v", err)
			c.restart = false
			// While restoring plugins, we need to explicitly set the state to disabled
			pm.config.Store.SetState(p, false)
			shutdownPlugin(p, c.exitChan, pm.executor)
			return err
		}
	}
	pm.config.Store.SetState(p, true)
	pm.config.Store.CallHandler(p)

	return pm.save(p)
}
// restore reattaches to a plugin task after a daemon restart. With live
// restore enabled, a dead plugin is re-enabled and a live one is re-adopted
// (controller registered, post-start handshake run). Without live restore, a
// still-running plugin is shut down so it can be restarted explicitly.
func (pm *Manager) restore(p *v2.Plugin, c *controller) error {
	stdout, stderr := makeLoggerStreams(p.GetID())
	alive, err := pm.executor.Restore(p.GetID(), stdout, stderr)
	if err != nil {
		return err
	}

	if pm.config.LiveRestoreEnabled {
		if !alive {
			return pm.enable(p, c, true)
		}

		c.exitChan = make(chan bool)
		c.restart = true
		pm.mu.Lock()
		pm.cMap[p] = c
		pm.mu.Unlock()
		return pm.pluginPostStart(p, c)
	}

	if alive {
		// TODO(@cpuguy83): Should we always just re-attach to the running plugin instead of doing this?
		c.restart = false
		shutdownPlugin(p, c.exitChan, pm.executor)
	}

	return nil
}
// shutdownTimeout is how long to wait for a plugin to exit after each signal.
const shutdownTimeout = 10 * time.Second

// shutdownPlugin stops the plugin task: SIGTERM first, and if the exit event
// does not arrive on ec within shutdownTimeout, SIGKILL followed by one more
// equally-long wait. Signalling failures are logged, not returned.
func shutdownPlugin(p *v2.Plugin, ec chan bool, executor Executor) {
	pluginID := p.GetID()

	if err := executor.Signal(pluginID, unix.SIGTERM); err != nil {
		log.G(context.TODO()).Errorf("Sending SIGTERM to plugin failed with error: %v", err)
		return
	}

	timeout := time.NewTimer(shutdownTimeout)
	defer timeout.Stop()

	select {
	case <-ec:
		log.G(context.TODO()).Debug("Clean shutdown of plugin")
	case <-timeout.C:
		log.G(context.TODO()).Debug("Force shutdown plugin")
		if err := executor.Signal(pluginID, unix.SIGKILL); err != nil {
			log.G(context.TODO()).Errorf("Sending SIGKILL to plugin failed with error: %v", err)
		}

		timeout.Reset(shutdownTimeout)
		select {
		case <-ec:
			log.G(context.TODO()).Debug("SIGKILL plugin shutdown")
		case <-timeout.C:
			// BUG FIX: previously logged p.Name (a method value, rendered as
			// a function pointer) instead of calling p.Name() for the name,
			// as every other call site does.
			log.G(context.TODO()).WithField("plugin", p.Name()).Warn("Force shutdown plugin FAILED")
		}
	}
}
// disable stops an enabled plugin, marks it disabled in the store, and
// persists the new state; disabling an already-disabled plugin is an error.
func (pm *Manager) disable(p *v2.Plugin, c *controller) error {
	if p.IsEnabled() {
		c.restart = false
		shutdownPlugin(p, c.exitChan, pm.executor)
		pm.config.Store.SetState(p, false)
		return pm.save(p)
	}
	return errors.Wrap(errDisabled(p.Name()), "plugin is already disabled")
}
// Shutdown stops all plugins and called during daemon shutdown.
func (pm *Manager) Shutdown() {
	for _, p := range pm.config.Store.GetAll() {
		pm.mu.RLock()
		c := pm.cMap[p]
		pm.mu.RUnlock()

		if pm.config.LiveRestoreEnabled && p.IsEnabled() {
			// Leave running plugins alone so live restore can re-adopt them.
			log.G(context.TODO()).Debug("Plugin active when liveRestore is set, skipping shutdown")
			continue
		}
		if p.IsEnabled() && pm.executor != nil {
			c.restart = false
			shutdownPlugin(p, c.exitChan, pm.executor)
		}
	}
	if err := mount.RecursiveUnmount(pm.config.Root); err != nil {
		log.G(context.TODO()).WithError(err).Warn("error cleaning up plugin mounts")
	}
}
// upgradePlugin swaps a plugin's rootfs and config for newly fetched content.
// The old rootfs is first renamed to "<rootfs>-old" as a backup; on failure
// the backup is restored, on success it is deleted and the plugin's digests
// are updated. The upgraded config is persisted via save.
func (pm *Manager) upgradePlugin(p *v2.Plugin, configDigest, manifestDigest digest.Digest, blobsums []digest.Digest, tmpRootFSDir string, privileges *types.PluginPrivileges) (retErr error) {
	config, err := pm.setupNewPlugin(configDigest, privileges)
	if err != nil {
		return err
	}

	pdir := filepath.Join(pm.config.Root, p.PluginObj.ID)
	orig := filepath.Join(pdir, "rootfs")

	// Make sure nothing is mounted
	// This could happen if the plugin was disabled with `-f` with active mounts.
	// If there is anything in `orig` is still mounted, this should error out.
	if err := mount.RecursiveUnmount(orig); err != nil {
		return errdefs.System(err)
	}

	backup := orig + "-old"
	if err := os.Rename(orig, backup); err != nil {
		return errors.Wrap(errdefs.System(err), "error backing up plugin data before upgrade")
	}

	defer func() {
		if retErr != nil {
			// Roll back: drop the (possibly partial) new rootfs and put the
			// backup into place.
			// NOTE(review): the log field says "dir"=backup while the path
			// being removed is orig — confirm which was intended.
			if err := os.RemoveAll(orig); err != nil {
				log.G(context.TODO()).WithError(err).WithField("dir", backup).Error("error cleaning up after failed upgrade")
				return
			}
			if err := os.Rename(backup, orig); err != nil {
				retErr = errors.Wrap(err, "error restoring old plugin root on upgrade failure")
			}
			if err := os.RemoveAll(tmpRootFSDir); err != nil && !os.IsNotExist(err) {
				log.G(context.TODO()).WithError(err).WithField("plugin", p.Name()).Errorf("error cleaning up plugin upgrade dir: %s", tmpRootFSDir)
			}
		} else {
			// Success: delete the backup and record the new digests.
			if err := os.RemoveAll(backup); err != nil {
				log.G(context.TODO()).WithError(err).WithField("dir", backup).Error("error cleaning up old plugin root after successful upgrade")
			}
			p.Config = configDigest
			p.Blobsums = blobsums
		}
	}()

	if err := os.Rename(tmpRootFSDir, orig); err != nil {
		return errors.Wrap(errdefs.System(err), "error upgrading")
	}

	p.PluginObj.Config = config
	p.Manifest = manifestDigest
	if err := pm.save(p); err != nil {
		return errors.Wrap(err, "error saving upgraded plugin config")
	}
	return nil
}
// setupNewPlugin reads and decodes the plugin config blob addressed by
// configDigest, rejects blobs with trailing JSON data, and — when privileges
// is non-nil — validates them against the privileges the config requires.
func (pm *Manager) setupNewPlugin(configDigest digest.Digest, privileges *types.PluginPrivileges) (types.PluginConfig, error) {
	configRA, err := pm.blobStore.ReaderAt(context.TODO(), ocispec.Descriptor{Digest: configDigest})
	if err != nil {
		return types.PluginConfig{}, err
	}
	defer configRA.Close()

	var config types.PluginConfig
	dec := json.NewDecoder(content.NewReader(configRA))
	if err := dec.Decode(&config); err != nil {
		return types.PluginConfig{}, errors.Wrapf(err, "failed to parse config")
	}
	// Anything left after the first JSON document means the blob is malformed.
	if dec.More() {
		return types.PluginConfig{}, errors.New("invalid config json")
	}

	requiredPrivileges := computePrivileges(config)
	if privileges != nil {
		if err := validatePrivileges(requiredPrivileges, *privileges); err != nil {
			return types.PluginConfig{}, err
		}
	}

	return config, nil
}
// createPlugin creates a new plugin. take lock before calling.
//
// A plugin directory is created under pm.config.Root keyed by the freshly
// generated plugin ID, rootFSDir is moved into it, and the config is
// persisted. On any failure the partially created directory is removed.
func (pm *Manager) createPlugin(name string, configDigest, manifestDigest digest.Digest, blobsums []digest.Digest, rootFSDir string, privileges *types.PluginPrivileges, opts ...CreateOpt) (_ *v2.Plugin, retErr error) {
	if err := pm.config.Store.validateName(name); err != nil { // todo: this check is wrong. remove store
		return nil, errdefs.InvalidParameter(err)
	}

	config, err := pm.setupNewPlugin(configDigest, privileges)
	if err != nil {
		return nil, err
	}

	p := &v2.Plugin{
		PluginObj: types.Plugin{
			Name:   name,
			ID:     stringid.GenerateRandomID(),
			Config: config,
		},
		Config:   configDigest,
		Blobsums: blobsums,
		Manifest: manifestDigest,
	}
	p.InitEmptySettings()
	for _, o := range opts {
		o(p)
	}

	pdir := filepath.Join(pm.config.Root, p.PluginObj.ID)
	if err := os.MkdirAll(pdir, 0o700); err != nil {
		return nil, errors.Wrapf(err, "failed to mkdir %v", pdir)
	}

	defer func() {
		if retErr != nil {
			// Best-effort cleanup of the partially created plugin dir.
			_ = os.RemoveAll(pdir)
		}
	}()

	if err := os.Rename(rootFSDir, filepath.Join(pdir, rootFSFileName)); err != nil {
		return nil, errors.Wrap(err, "failed to rename rootfs")
	}

	if err := pm.save(p); err != nil {
		return nil, err
	}

	// BUGFIX: the error from Store.Add (ID collision) was previously
	// discarded, which could leave the in-memory store and on-disk state
	// inconsistent. Returning it also triggers the cleanup defer above.
	if err := pm.config.Store.Add(p); err != nil { // todo: remove
		return nil, err
	}

	return p, nil
}
// recursiveUnmount unmounts target and every mount nested beneath it.
// Thin wrapper over mount.RecursiveUnmount kept for call-site readability.
func recursiveUnmount(target string) error {
	return mount.RecursiveUnmount(target)
}
package plugin
import (
"sync"
"time"
"github.com/containerd/containerd/v2/core/remotes/docker"
)
// newPushJobs returns a pushJobs that reports per-ref upload progress using
// the given status tracker.
func newPushJobs(tracker docker.StatusTracker) *pushJobs {
	jobs := &pushJobs{t: tracker}
	jobs.names = make(map[string]string)
	return jobs
}
// pushJobs tracks in-flight push jobs so their upload status can be reported
// in the order the jobs were added. All fields are guarded by mu.
type pushJobs struct {
	t  docker.StatusTracker
	mu sync.Mutex
	// jobs preserves insertion order of job refs.
	jobs []string
	// maps job ref to a name
	names map[string]string
}
// add registers job ref id under the given display name.
// Adding the same ref twice is a no-op, preserving the original order.
func (p *pushJobs) add(id, name string) {
	p.mu.Lock()
	defer p.mu.Unlock()

	if _, exists := p.names[id]; exists {
		return
	}
	p.names[id] = name
	p.jobs = append(p.jobs, id)
}
// status returns a snapshot of every tracked job, in insertion order.
// Jobs the tracker has no record of yet are reported as "Waiting".
func (p *pushJobs) status() []contentStatus {
	p.mu.Lock()
	defer p.mu.Unlock()

	statuses := make([]contentStatus, 0, len(p.jobs))
	for _, ref := range p.jobs {
		cs := contentStatus{Ref: p.names[ref]}

		st, err := p.t.GetStatus(ref)
		if err != nil {
			cs.Status = "Waiting"
		} else {
			cs.Total = st.Total
			cs.Offset = st.Offset
			cs.StartedAt = st.StartedAt
			cs.UpdatedAt = st.UpdatedAt
			// An empty upload UUID means the upload has finished.
			if st.UploadUUID == "" {
				cs.Status = "Upload complete"
			} else {
				cs.Status = "Uploading"
			}
		}

		statuses = append(statuses, cs)
	}
	return statuses
}
// contentStatus is a point-in-time view of one push job's progress,
// produced by pushJobs.status.
type contentStatus struct {
	Status    string
	Total     int64
	Offset    int64
	StartedAt time.Time
	UpdatedAt time.Time
	Ref       string
}
package plugin
import (
"context"
"crypto/tls"
"net"
"net/http"
"time"
"github.com/containerd/containerd/v2/core/remotes"
"github.com/containerd/containerd/v2/core/remotes/docker"
"github.com/containerd/log"
"github.com/distribution/reference"
"github.com/docker/docker/dockerversion"
"github.com/moby/moby/api/types/registry"
"github.com/pkg/errors"
)
// scope builds the correct auth scope for the registry client to authorize against
// By default the client currently only does a "repository:" scope with out a classifier, e.g. "(plugin)"
// Without this, the client will not be able to authorize the request
func scope(ref reference.Named, push bool) string {
	s := "repository(plugin):" + reference.Path(reference.TrimNamed(ref)) + ":pull"
	if !push {
		return s
	}
	return s + ",push"
}
// newResolver builds a containerd remotes.Resolver for plugin distribution,
// wiring in the status tracker, registry hosts derived from auth/httpFallback,
// and a Docker User-Agent header.
func (pm *Manager) newResolver(ctx context.Context, tracker docker.StatusTracker, auth *registry.AuthConfig, headers http.Header, httpFallback bool) (remotes.Resolver, error) {
	hdrs := headers
	if hdrs == nil {
		hdrs = http.Header{}
	}
	hdrs.Add("User-Agent", dockerversion.DockerUserAgent(ctx))

	opts := docker.ResolverOptions{
		Tracker: tracker,
		Headers: hdrs,
		Hosts:   pm.registryHostsFn(auth, httpFallback),
	}
	return docker.NewResolver(opts), nil
}
func registryHTTPClient(config *tls.Config) *http.Client {
return &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
}).DialContext,
TLSClientConfig: config,
TLSHandshakeTimeout: 10 * time.Second,
IdleConnTimeout: 30 * time.Second,
},
}
}
// registryHostsFn returns a docker.RegistryHosts lookup function mapping a
// registry hostname to the endpoints to try, each with an authorizer built
// from the supplied credentials. With httpFallback set, only plain-http
// endpoints are returned.
func (pm *Manager) registryHostsFn(auth *registry.AuthConfig, httpFallback bool) docker.RegistryHosts {
	return func(hostname string) ([]docker.RegistryHost, error) {
		eps, err := pm.config.RegistryService.LookupPullEndpoints(hostname)
		if err != nil {
			return nil, errors.Wrapf(err, "error resolving repository for %s", hostname)
		}
		hosts := make([]docker.RegistryHost, 0, len(eps))
		for _, ep := range eps {
			// forced http fallback is used only for push since the containerd pusher only ever uses the first host we
			// pass to it.
			// So it is the callers responsibility to retry with this flag set.
			if httpFallback && ep.URL.Scheme != "http" {
				log.G(context.TODO()).WithField("registryHost", hostname).WithField("endpoint", ep).Debugf("Skipping non-http endpoint")
				continue
			}
			caps := docker.HostCapabilityPull | docker.HostCapabilityResolve
			// Mirror endpoints are never granted push capability.
			if !ep.Mirror {
				caps = caps | docker.HostCapabilityPush
			}
			host, err := docker.DefaultHost(ep.URL.Host)
			if err != nil {
				return nil, err
			}
			client := registryHTTPClient(ep.TLSConfig)
			hosts = append(hosts, docker.RegistryHost{
				Host:         host,
				Scheme:       ep.URL.Scheme,
				Client:       client,
				Path:         "/v2",
				Capabilities: caps,
				Authorizer: docker.NewDockerAuthorizer(
					docker.WithAuthClient(client),
					docker.WithAuthCreds(func(_ string) (string, string, error) {
						// An identity token takes precedence over basic credentials.
						if auth.IdentityToken != "" {
							return "", auth.IdentityToken, nil
						}
						return auth.Username, auth.Password, nil
					}),
				),
			})
		}
		log.G(context.TODO()).WithField("registryHost", hostname).WithField("hosts", hosts).Debug("Resolved registry hosts")
		return hosts, nil
	}
}
package plugin
import (
"context"
"fmt"
"strings"
"github.com/containerd/log"
"github.com/distribution/reference"
v2 "github.com/docker/docker/daemon/pkg/plugin/v2"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/pkg/plugingetter"
"github.com/docker/docker/pkg/plugins"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
)
// allowV1PluginsFallback determines daemon's support for V1 plugins.
// When the time comes to remove support for V1 plugins, flipping
// this bool is all that will be needed.
// It is consulted by Get, GetAllByCap and Handle below.
const allowV1PluginsFallback = true

// defaultAPIVersion is the version of the plugin API for volume, network,
// IPAM and authz. This is a very stable API. When we update this API, then
// pluginType should include a version. e.g. "networkdriver/2.0".
const defaultAPIVersion = "1.0"
// GetV2Plugin retrieves a plugin by name, id or partial ID.
func (ps *Store) GetV2Plugin(refOrID string) (*v2.Plugin, error) {
	ps.RLock()
	defer ps.RUnlock()

	id, err := ps.resolvePluginID(refOrID)
	if err != nil {
		return nil, err
	}

	plugin, ok := ps.plugins[id]
	if !ok {
		return nil, errors.WithStack(errNotFound(id))
	}
	return plugin, nil
}
// validateName returns error if name is already reserved. always call with lock and full name
func (ps *Store) validateName(name string) error {
	for _, existing := range ps.plugins {
		if existing.Name() == name {
			return alreadyExistsError(name)
		}
	}
	return nil
}
// GetAll retrieves all plugins.
//
// NOTE(review): this returns the internal map directly rather than a copy,
// so callers observe later mutations and must treat the result as read-only
// — confirm all call sites do.
func (ps *Store) GetAll() map[string]*v2.Plugin {
	ps.RLock()
	defer ps.RUnlock()
	return ps.plugins
}
// SetAll initialized plugins during daemon restore.
func (ps *Store) SetAll(plugins map[string]*v2.Plugin) {
	ps.Lock()
	defer ps.Unlock()

	// Attach registered runtime-spec options before publishing the new map.
	for _, plugin := range plugins {
		ps.setSpecOpts(plugin)
	}
	ps.plugins = plugins
}
// getAllByCap returns every enabled plugin providing the given capability.
func (ps *Store) getAllByCap(capability string) []plugingetter.CompatPlugin {
	ps.RLock()
	defer ps.RUnlock()

	result := make([]plugingetter.CompatPlugin, 0, 1)
	for _, plugin := range ps.plugins {
		if !plugin.IsEnabled() {
			continue
		}
		if _, err := plugin.FilterByCap(capability); err != nil {
			continue
		}
		result = append(result, plugin)
	}
	return result
}
// SetState sets the active state of the plugin.
//
// NOTE(review): an earlier comment claimed this also updates plugindb, but
// the body only mutates the in-memory object — persistence happens elsewhere.
func (ps *Store) SetState(p *v2.Plugin, state bool) {
	ps.Lock()
	defer ps.Unlock()
	p.PluginObj.Enabled = state
}
// setSpecOpts installs a runtime-spec modifier on p that applies every
// SpecOpt registered for any of the plugin's interface types.
func (ps *Store) setSpecOpts(p *v2.Plugin) {
	var opts []SpecOpt
	for _, typ := range p.GetTypes() {
		if registered, ok := ps.specOpts[typ.String()]; ok {
			opts = append(opts, registered...)
		}
	}

	p.SetSpecOptModifier(func(s *specs.Spec) {
		for _, apply := range opts {
			apply(s)
		}
	})
}
// Add adds a plugin to memory and plugindb.
// An error will be returned if there is a collision.
func (ps *Store) Add(p *v2.Plugin) error {
	ps.Lock()
	defer ps.Unlock()

	id := p.GetID()
	if existing, ok := ps.plugins[id]; ok {
		return fmt.Errorf("plugin %q has the same ID %s as %q", p.Name(), id, existing.Name())
	}
	ps.setSpecOpts(p)
	ps.plugins[id] = p
	return nil
}
// Remove removes a plugin from memory and plugindb.
func (ps *Store) Remove(p *v2.Plugin) {
	ps.Lock()
	defer ps.Unlock()
	delete(ps.plugins, p.GetID())
}
// Get returns an enabled plugin matching the given name and capability.
//
// Lookup order: the managed (v2) store first. A plugin found there but
// disabled fails immediately; only a true "not found" falls through to the
// legacy (v1) lookup, and only while allowV1PluginsFallback is set.
func (ps *Store) Get(name, capability string, mode int) (plugingetter.CompatPlugin, error) {
	// Lookup using new model.
	if ps != nil {
		p, err := ps.GetV2Plugin(name)
		if err == nil {
			if p.IsEnabled() {
				fp, err := p.FilterByCap(capability)
				if err != nil {
					return nil, err
				}
				// Record the acquisition mode against the plugin's refcount.
				p.AddRefCount(mode)
				return fp, nil
			}

			// Plugin was found but it is disabled, so we should not fall back to legacy plugins
			// but we should error out right away
			return nil, errDisabled(name)
		}
		// Any error other than "not found" is terminal.
		var ierr errNotFound
		if !errors.As(err, &ierr) {
			return nil, err
		}
	}
	if !allowV1PluginsFallback {
		return nil, errNotFound(name)
	}
	p, err := plugins.Get(name, capability)
	if err == nil {
		return p, nil
	}
	if errors.Is(err, plugins.ErrNotFound) {
		return nil, errNotFound(name)
	}
	return nil, errors.Wrap(errdefs.System(err), "legacy plugin")
}
// GetAllManagedPluginsByCap returns a list of managed plugins matching the given capability.
// Unlike GetAllByCap, this never consults the legacy (v1) plugin registry.
func (ps *Store) GetAllManagedPluginsByCap(capability string) []plugingetter.CompatPlugin {
	return ps.getAllByCap(capability)
}
// GetAllByCap returns a list of enabled plugins matching the given capability.
func (ps *Store) GetAllByCap(capability string) ([]plugingetter.CompatPlugin, error) {
	result := make([]plugingetter.CompatPlugin, 0, 1)

	/* Daemon start always calls plugin.Init thereby initializing a store.
	 * So store on experimental builds can never be nil, even while
	 * handling legacy plugins. However, there are legacy plugin unit
	 * tests where the volume subsystem directly talks with the plugin,
	 * bypassing the daemon. For such tests, this check is necessary.
	 */
	if ps != nil {
		result = ps.getAllByCap(capability)
	}

	// Lookup with legacy model
	if allowV1PluginsFallback {
		legacy, err := plugins.NewLocalRegistry().GetAll(capability)
		if err != nil {
			return nil, errors.Wrap(errdefs.System(err), "legacy plugin")
		}
		for _, lp := range legacy {
			result = append(result, lp)
		}
	}
	return result, nil
}
func pluginType(capability string) string {
return fmt.Sprintf("docker.%s/%s", strings.ToLower(capability), defaultAPIVersion)
}
// Handle sets a callback for a given capability. It is only used by network
// and ipam drivers during plugin registration. The callback registers the
// driver with the subsystem (network, ipam).
func (ps *Store) Handle(capability string, callback func(string, *plugins.Client)) {
	typ := pluginType(capability)

	// Register callback with new plugin model.
	ps.Lock()
	ps.handlers[typ] = append(ps.handlers[typ], callback)
	ps.Unlock()

	// Register callback with legacy plugin model.
	if allowV1PluginsFallback {
		plugins.Handle(capability, callback)
	}
}
// RegisterRuntimeOpt stores a list of SpecOpts for the provided capability.
// These options are applied to the runtime spec before a plugin is started for the specified capability.
func (ps *Store) RegisterRuntimeOpt(capability string, opts ...SpecOpt) {
	ps.Lock()
	defer ps.Unlock()
	typ := pluginType(capability)
	// Append (rather than replace) so multiple subsystems can register
	// options for the same capability.
	ps.specOpts[typ] = append(ps.specOpts[typ], opts...)
}
// CallHandler calls the registered callback. It is invoked during plugin enable.
//
// NOTE(review): ps.handlers is read here without taking ps's lock — confirm
// that all Handle registrations complete before any plugin is enabled.
func (ps *Store) CallHandler(p *v2.Plugin) {
	for _, typ := range p.GetTypes() {
		for _, handler := range ps.handlers[typ.String()] {
			handler(p.Name(), p.Client()) //nolint:staticcheck // FIXME(thaJeztah): p.Client is deprecated: use p.Addr() and manually create the client
		}
	}
}
// resolvePluginID must be protected by ps.RLock
//
// Resolution order: exact full ID, then normalized reference name (with the
// default tag applied), then unique ID prefix. Canonical (digested)
// references resolve to errNotFound; an ambiguous prefix yields errAmbiguous.
func (ps *Store) resolvePluginID(idOrName string) (string, error) {
	if validFullID.MatchString(idOrName) {
		return idOrName, nil
	}

	ref, err := reference.ParseNormalizedNamed(idOrName)
	if err != nil {
		return "", errors.WithStack(errNotFound(idOrName))
	}
	if _, ok := ref.(reference.Canonical); ok {
		log.G(context.TODO()).Warnf("canonical references cannot be resolved: %v", reference.FamiliarString(ref))
		return "", errors.WithStack(errNotFound(idOrName))
	}

	ref = reference.TagNameOnly(ref)

	for _, p := range ps.plugins {
		if p.PluginObj.Name == reference.FamiliarString(ref) {
			return p.PluginObj.ID, nil
		}
	}

	// Fall back to unique-prefix matching on the plugin IDs.
	var found *v2.Plugin
	for id, p := range ps.plugins { // this can be optimized
		if strings.HasPrefix(id, idOrName) {
			if found != nil {
				return "", errors.WithStack(errAmbiguous(idOrName))
			}
			found = p
		}
	}
	if found == nil {
		return "", errors.WithStack(errNotFound(idOrName))
	}
	return found.PluginObj.ID, nil
}
package v2
import (
"errors"
"fmt"
"net"
"path/filepath"
"strings"
"sync"
"time"
"github.com/docker/docker/pkg/plugingetter"
"github.com/docker/docker/pkg/plugins"
"github.com/moby/moby/api/types"
"github.com/opencontainers/go-digest"
"github.com/opencontainers/runtime-spec/specs-go"
)
// Plugin represents an individual plugin.
type Plugin struct {
	// mu guards the mutable fields accessed through the getter/setter
	// methods below (pClient, refCount, modifyRuntimeSpec, timeout, addr).
	mu        sync.RWMutex
	PluginObj types.Plugin `json:"plugin"` // todo: embed struct
	pClient   *plugins.Client
	refCount  int
	Rootfs    string // TODO: make private
	// Config is the digest of the plugin's config blob.
	Config digest.Digest
	// Blobsums are the digests of the plugin's content blobs.
	Blobsums []digest.Digest
	// Manifest is the digest of the plugin's manifest.
	Manifest digest.Digest
	// modifyRuntimeSpec, when set, is applied to the generated runtime spec.
	modifyRuntimeSpec func(*specs.Spec)
	SwarmServiceID    string
	timeout           time.Duration
	addr              net.Addr
}
// defaultPluginRuntimeDestination is the in-container path where the plugin's
// runtime directory is bind-mounted (see InitSpec).
const defaultPluginRuntimeDestination = "/run/docker/plugins"

// ErrInadequateCapability indicates that the plugin did not have the requested capability.
type ErrInadequateCapability struct {
	cap string
}

// Error implements the error interface.
func (e ErrInadequateCapability) Error() string {
	return fmt.Sprintf("plugin does not provide %q capability", e.cap)
}
// ScopedPath returns the path scoped to the plugin rootfs
func (p *Plugin) ScopedPath(s string) string {
	propagated := p.PluginObj.Config.PropagatedMount
	if propagated != "" && strings.HasPrefix(s, propagated) {
		// re-scope to the propagated mount path on the host
		rel := strings.TrimPrefix(s, propagated)
		return filepath.Join(filepath.Dir(p.Rootfs), "propagated-mount", rel)
	}
	return filepath.Join(p.Rootfs, s)
}
// Client returns the plugin client.
//
// Deprecated: use p.Addr() and manually create the client
func (p *Plugin) Client() *plugins.Client {
	p.mu.RLock()
	defer p.mu.RUnlock()

	// May be nil if no client has been set via SetPClient.
	return p.pClient
}
// SetPClient set the plugin client.
//
// Deprecated: Hardcoded plugin client is deprecated
func (p *Plugin) SetPClient(client *plugins.Client) {
	p.mu.Lock()
	defer p.mu.Unlock()

	p.pClient = client
}
// IsV1 returns true for V1 plugins and false otherwise.
// Managed (v2) plugins are never V1, so this is always false.
func (p *Plugin) IsV1() bool {
	return false
}

// Name returns the plugin name.
func (p *Plugin) Name() string {
	return p.PluginObj.Name
}
// FilterByCap query the plugin for a given capability.
func (p *Plugin) FilterByCap(capability string) (*Plugin, error) {
	want := strings.ToLower(capability)
	for _, typ := range p.PluginObj.Config.Interface.Types {
		if typ.Prefix == "docker" && typ.Capability == want {
			return p, nil
		}
	}
	return nil, ErrInadequateCapability{want}
}
// InitEmptySettings initializes empty settings for a plugin.
func (p *Plugin) InitEmptySettings() {
	cfg := &p.PluginObj.Config
	settings := &p.PluginObj.Settings

	settings.Mounts = make([]types.PluginMount, len(cfg.Mounts))
	copy(settings.Mounts, cfg.Mounts)

	settings.Devices = make([]types.PluginDevice, len(cfg.Linux.Devices))
	copy(settings.Devices, cfg.Linux.Devices)

	settings.Env = make([]string, 0, len(cfg.Env))
	for _, env := range cfg.Env {
		// Only env vars carrying a configured value become settings entries.
		if env.Value != nil {
			settings.Env = append(settings.Env, env.Name+"="+*env.Value)
		}
	}

	settings.Args = make([]string, len(cfg.Args.Value))
	copy(settings.Args, cfg.Args.Value)
}
// Set is used to pass arguments to the plugin.
//
// Each argument is "name[.field][=value]" (parsed by newSettables). The name
// is matched, in order, against the config's env vars, mounts, devices, and
// finally the args name; the first match is applied to the in-memory
// settings. The plugin must be disabled while setting.
func (p *Plugin) Set(args []string) error {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.PluginObj.Enabled {
		return errors.New("cannot set on an active plugin, disable plugin before setting")
	}
	sets, err := newSettables(args)
	if err != nil {
		return err
	}
	// TODO(vieux): lots of code duplication here, needs to be refactored.
next:
	for _, set := range sets {
		s := set
		// range over all the envs in the config
		for _, env := range p.PluginObj.Config.Env {
			// found the env in the config
			if env.Name == s.name {
				// is it settable ?
				if ok, err := s.isSettable(allowedSettableFieldsEnv, env.Settable); err != nil {
					return err
				} else if !ok {
					return fmt.Errorf("%q is not settable", s.prettyName())
				}
				// it is, so lets update the settings in memory
				updateSettingsEnv(&p.PluginObj.Settings.Env, &s)
				continue next
			}
		}
		// range over all the mounts in the config
		for _, mount := range p.PluginObj.Config.Mounts {
			// found the mount in the config
			if mount.Name == s.name {
				// is it settable ?
				if ok, err := s.isSettable(allowedSettableFieldsMounts, mount.Settable); err != nil {
					return err
				} else if !ok {
					return fmt.Errorf("%q is not settable", s.prettyName())
				}
				// it is, so lets update the settings in memory
				if mount.Source == nil {
					return errors.New("Plugin config has no mount source")
				}
				*mount.Source = s.value
				continue next
			}
		}
		// range over all the devices in the config
		for _, device := range p.PluginObj.Config.Linux.Devices {
			// found the device in the config
			if device.Name == s.name {
				// is it settable ?
				if ok, err := s.isSettable(allowedSettableFieldsDevices, device.Settable); err != nil {
					return err
				} else if !ok {
					return fmt.Errorf("%q is not settable", s.prettyName())
				}
				// it is, so lets update the settings in memory
				if device.Path == nil {
					return errors.New("Plugin config has no device path")
				}
				*device.Path = s.value
				continue next
			}
		}
		// found the name in the config
		if p.PluginObj.Config.Args.Name == s.name {
			// is it settable ?
			if ok, err := s.isSettable(allowedSettableFieldsArgs, p.PluginObj.Config.Args.Settable); err != nil {
				return err
			} else if !ok {
				return fmt.Errorf("%q is not settable", s.prettyName())
			}
			// it is, so lets update the settings in memory
			p.PluginObj.Settings.Args = strings.Split(s.value, " ")
			continue next
		}
		return fmt.Errorf("setting %q not found in the plugin configuration", s.name)
	}
	return nil
}
// IsEnabled returns the active state of the plugin.
func (p *Plugin) IsEnabled() bool {
	p.mu.RLock()
	defer p.mu.RUnlock()
	return p.PluginObj.Enabled
}

// GetID returns the plugin's ID.
func (p *Plugin) GetID() string {
	p.mu.RLock()
	defer p.mu.RUnlock()
	return p.PluginObj.ID
}

// GetSocket returns the plugin socket.
func (p *Plugin) GetSocket() string {
	p.mu.RLock()
	defer p.mu.RUnlock()
	return p.PluginObj.Config.Interface.Socket
}

// GetTypes returns the interface types of a plugin.
func (p *Plugin) GetTypes() []types.PluginInterfaceType {
	p.mu.RLock()
	defer p.mu.RUnlock()
	return p.PluginObj.Config.Interface.Types
}

// GetRefCount returns the reference count.
func (p *Plugin) GetRefCount() int {
	p.mu.RLock()
	defer p.mu.RUnlock()
	return p.refCount
}

// AddRefCount adds to reference count.
// count is typically plugingetter.Acquire or plugingetter.Release
// (see Acquire/Release below).
func (p *Plugin) AddRefCount(count int) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.refCount += count
}
// Acquire increments the plugin's reference count
// This should be followed up by `Release()` when the plugin is no longer in use.
func (p *Plugin) Acquire() {
	p.AddRefCount(plugingetter.Acquire)
}

// Release decrements the plugin's reference count
// This should only be called when the plugin is no longer in use, e.g. with
// via `Acquire()` or getter.Get("name", "type", plugingetter.Acquire)
func (p *Plugin) Release() {
	p.AddRefCount(plugingetter.Release)
}
// SetSpecOptModifier sets the function to use to modify the generated
// runtime spec.
func (p *Plugin) SetSpecOptModifier(f func(*specs.Spec)) {
	p.mu.Lock()
	p.modifyRuntimeSpec = f
	p.mu.Unlock()
}

// Timeout gets the currently configured connection timeout.
// This should be used when dialing the plugin.
func (p *Plugin) Timeout() time.Duration {
	p.mu.RLock()
	t := p.timeout
	p.mu.RUnlock()
	return t
}

// SetTimeout sets the timeout to use for dialing.
func (p *Plugin) SetTimeout(t time.Duration) {
	p.mu.Lock()
	p.timeout = t
	p.mu.Unlock()
}

// Addr returns the net.Addr to use to connect to the plugin socket
func (p *Plugin) Addr() net.Addr {
	p.mu.RLock()
	addr := p.addr
	p.mu.RUnlock()
	return addr
}

// SetAddr sets the plugin address which can be used for dialing the plugin.
func (p *Plugin) SetAddr(addr net.Addr) {
	p.mu.Lock()
	p.addr = addr
	p.mu.Unlock()
}

// Protocol is the protocol that should be used for interacting with the plugin.
// Falls back to plugins.ProtocolSchemeHTTPV1 when the config declares none.
// NOTE(review): unlike the accessors above, this reads config without taking
// p.mu — presumably the config is immutable after load; confirm.
func (p *Plugin) Protocol() string {
	if p.PluginObj.Config.Interface.ProtocolScheme != "" {
		return p.PluginObj.Config.Interface.ProtocolScheme
	}
	return plugins.ProtocolSchemeHTTPV1
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package v2
import (
"os"
"path/filepath"
"runtime"
"strings"
"github.com/docker/docker/internal/rootless/mountopts"
"github.com/docker/docker/internal/sliceutil"
"github.com/docker/docker/oci"
"github.com/moby/moby/api/types"
"github.com/moby/sys/userns"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
)
// InitSpec creates an OCI spec from the plugin's config.
//
// The spec starts from oci.DefaultSpec with the plugin rootfs as root, then
// layers on: the propagated mount (if configured), the config-declared
// mounts, the plugin runtime dir, optional /etc/hosts + /etc/resolv.conf
// binds, device entries, env, args and capabilities. Host PID/IPC/network
// namespaces are honored by removing the corresponding namespace entries.
func (p *Plugin) InitSpec(execRoot string) (*specs.Spec, error) {
	s := oci.DefaultSpec()

	s.Root = &specs.Root{
		Path:     p.Rootfs,
		Readonly: false, // TODO: all plugins should be readonly? settable in config?
	}

	// Destinations explicitly configured by the user; used below to drop
	// conflicting default /dev/ mounts.
	userMounts := make(map[string]struct{}, len(p.PluginObj.Settings.Mounts))
	for _, m := range p.PluginObj.Settings.Mounts {
		userMounts[m.Destination] = struct{}{}
	}
	execRoot = filepath.Join(execRoot, p.PluginObj.ID)
	if err := os.MkdirAll(execRoot, 0o700); err != nil {
		return nil, errors.WithStack(err)
	}

	if p.PluginObj.Config.PropagatedMount != "" {
		pRoot := filepath.Join(filepath.Dir(p.Rootfs), "propagated-mount")
		s.Mounts = append(s.Mounts, specs.Mount{
			Source:      pRoot,
			Destination: p.PluginObj.Config.PropagatedMount,
			Type:        "bind",
			Options:     []string{"rbind", "rw", "rshared"},
		})
		// Propagated mounts must flow back to the host.
		s.Linux.RootfsPropagation = "rshared"
	}

	mounts := append(p.PluginObj.Config.Mounts, types.PluginMount{
		Source:      &execRoot,
		Destination: defaultPluginRuntimeDestination,
		Type:        "bind",
		Options:     []string{"rbind", "rshared"},
	})

	if p.PluginObj.Config.Network.Type != "" {
		// TODO: if net == bridge, use libnetwork controller to create a new plugin-specific bridge, bind mount /etc/hosts and /etc/resolv.conf look at the docker code (allocateNetwork, initialize)
		if p.PluginObj.Config.Network.Type == "host" {
			oci.RemoveNamespace(&s, specs.LinuxNamespaceType("network"))
		}
		etcHosts := "/etc/hosts"
		resolvConf := "/etc/resolv.conf"
		mounts = append(mounts,
			types.PluginMount{
				Source:      &etcHosts,
				Destination: etcHosts,
				Type:        "bind",
				Options:     []string{"rbind", "ro"},
			},
			types.PluginMount{
				Source:      &resolvConf,
				Destination: resolvConf,
				Type:        "bind",
				Options:     []string{"rbind", "ro"},
			})
	}
	if p.PluginObj.Config.PidHost {
		oci.RemoveNamespace(&s, specs.LinuxNamespaceType("pid"))
	}

	if p.PluginObj.Config.IpcHost {
		oci.RemoveNamespace(&s, specs.LinuxNamespaceType("ipc"))
	}

	for _, mnt := range mounts {
		m := specs.Mount{
			Destination: mnt.Destination,
			Type:        mnt.Type,
			Options:     mnt.Options,
		}
		if mnt.Source == nil {
			return nil, errors.New("mount source is not specified")
		}
		m.Source = *mnt.Source
		s.Mounts = append(s.Mounts, m)
	}

	// NOTE(review): this deletes elements from s.Mounts while ranging over
	// it by index, which skips the element following each deletion — with
	// multiple /dev/ user mounts the wrong entries may be kept or removed.
	// Left as-is because the intended winner among duplicate destinations is
	// unclear from this code; confirm desired semantics before rewriting.
	for i, m := range s.Mounts {
		if strings.HasPrefix(m.Destination, "/dev/") {
			if _, ok := userMounts[m.Destination]; ok {
				s.Mounts = append(s.Mounts[:i], s.Mounts[i+1:]...)
			}
		}
	}

	if p.PluginObj.Config.Linux.AllowAllDevices {
		s.Linux.Resources.Devices = []specs.LinuxDeviceCgroup{{Allow: true, Access: "rwm"}}
	}
	for _, dev := range p.PluginObj.Settings.Devices {
		path := *dev.Path
		d, dPermissions, err := oci.DevicesFromPath(path, path, "rwm")
		if err != nil {
			return nil, errors.WithStack(err)
		}
		s.Linux.Devices = append(s.Linux.Devices, d...)
		s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, dPermissions...)
	}

	// PATH comes first; configured env vars follow and may override it.
	envs := make([]string, 1, len(p.PluginObj.Settings.Env)+1)
	envs[0] = "PATH=" + oci.DefaultPathEnv(runtime.GOOS)
	envs = append(envs, p.PluginObj.Settings.Env...)

	args := append(p.PluginObj.Config.Entrypoint, p.PluginObj.Settings.Args...)
	cwd := p.PluginObj.Config.WorkDir
	if cwd == "" {
		cwd = "/"
	}
	s.Process.Terminal = false
	s.Process.Args = args
	s.Process.Cwd = cwd
	s.Process.Env = envs

	// Grant the configured capabilities in every capability set.
	caps := s.Process.Capabilities
	caps.Bounding = append(caps.Bounding, p.PluginObj.Config.Linux.Capabilities...)
	caps.Permitted = append(caps.Permitted, p.PluginObj.Config.Linux.Capabilities...)
	caps.Inheritable = append(caps.Inheritable, p.PluginObj.Config.Linux.Capabilities...)
	caps.Effective = append(caps.Effective, p.PluginObj.Config.Linux.Capabilities...)

	if p.modifyRuntimeSpec != nil {
		p.modifyRuntimeSpec(&s)
	}

	// Rootless mode requires modifying the mount flags
	// https://github.com/moby/moby/issues/47248#issuecomment-1927776700
	// https://github.com/moby/moby/pull/47558
	if userns.RunningInUserNS() {
		for i := range s.Mounts {
			m := &s.Mounts[i]
			for _, o := range m.Options {
				switch o {
				case "bind", "rbind":
					if _, err := os.Lstat(m.Source); err != nil {
						if errors.Is(err, os.ErrNotExist) {
							continue
						}
						return nil, err
					}
					// UnprivilegedMountFlags gets the set of mount flags that are set on the mount that contains the given
					// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
					// bind-mounting "with options" will not fail with user namespaces, due to
					// kernel restrictions that require user namespace mounts to preserve
					// CL_UNPRIVILEGED locked flags.
					unpriv, err := mountopts.UnprivilegedMountFlags(m.Source)
					if err != nil {
						return nil, errors.Wrapf(err, "failed to get unprivileged mount flags for %+v", m)
					}
					m.Options = sliceutil.Dedup(append(m.Options, unpriv...))
				}
			}
		}
	}

	return &s, nil
}
package v2
import (
"errors"
"fmt"
"strings"
)
// settable represents a single "name[.field][=value]" argument passed to
// plugin set.
type settable struct {
	name  string
	field string
	value string
}

var (
	allowedSettableFieldsEnv     = []string{"value"}
	allowedSettableFieldsArgs    = []string{"value"}
	allowedSettableFieldsDevices = []string{"path"}
	allowedSettableFieldsMounts  = []string{"source"}

	errMultipleFields = errors.New("multiple fields are settable, one must be specified")
	errInvalidFormat  = errors.New("invalid format, must be <name>[.<field>][=<value>]")
)

// newSettables parses every argument into a settable, failing on the first
// malformed entry.
func newSettables(args []string) ([]settable, error) {
	sets := make([]settable, 0, len(args))
	for _, arg := range args {
		s, err := newSettable(arg)
		if err != nil {
			return nil, err
		}
		sets = append(sets, s)
	}
	return sets, nil
}

// newSettable parses one "name[.field][=value]" argument.
func newSettable(arg string) (settable, error) {
	var s settable

	name, value, hasValue := strings.Cut(arg, "=")
	if hasValue && name == "" {
		// "=value" carries no name at all.
		return s, errInvalidFormat
	}
	s.name = name
	if hasValue {
		s.value = value
	}

	// A trailing ".field" qualifier is split off the name; a leading dot
	// (index 0) is treated as part of the name itself.
	if dot := strings.LastIndex(s.name, "."); dot > 0 {
		s.field = s.name[dot+1:]
		s.name = s.name[:dot]
	}

	return s, nil
}

// prettyName return name.field if there is a field, otherwise name.
func (set *settable) prettyName() string {
	if set.field == "" {
		return set.name
	}
	return set.name + "." + set.field
}

// isSettable reports whether set.field is both permitted for this kind of
// setting (allowedSettableFields) and declared settable in the plugin config
// (settable). When no field was given and exactly one is declared settable,
// that field is filled in as the default.
func (set *settable) isSettable(allowedSettableFields []string, settable []string) (bool, error) {
	if set.field == "" {
		switch {
		case len(settable) == 1:
			// if field is not specified and there only one settable, default to it.
			set.field = settable[0]
		case len(settable) > 1:
			return false, errMultipleFields
		}
	}

	allowed := false
	for _, f := range allowedSettableFields {
		if set.field == f {
			allowed = true
			break
		}
	}
	if !allowed {
		return false, nil
	}
	for _, f := range settable {
		if set.field == f {
			return true, nil
		}
	}
	return false, nil
}

// updateSettingsEnv overwrites the env entry whose name matches set.name,
// or appends a new entry when none matches.
func updateSettingsEnv(env *[]string, set *settable) {
	entry := set.name + "=" + set.value
	for i, e := range *env {
		if name, _, _ := strings.Cut(e, "="); name == set.name {
			(*env)[i] = entry
			return
		}
	}
	*env = append(*env, entry)
}
package mounts
import (
"errors"
"fmt"
"path"
"strings"
"github.com/docker/docker/internal/lazyregexp"
"github.com/moby/moby/api/types/mount"
)
// NewLCOWParser creates a parser with Linux Containers on Windows semantics.
// It reuses the Windows parser, overriding only validation (see lcowParser).
func NewLCOWParser() Parser {
	return &lcowParser{
		windowsParser{
			fi: defaultFileInfoProvider{},
		},
	}
}
// rxLCOWDestination is the regex expression for the mount destination for LCOW
//
// Destination (aka container path):
// - Variation on hostdir but can be a drive followed by colon as well
// - If a path, must be absolute. Can include spaces
// - Drive cannot be c: (explicitly checked in code, not RegEx)
const rxLCOWDestination = `(?P<destination>/(?:[^\\/:*?"<>\r\n]+[/]?)*)`

var (
	// lcowMountDestinationRegex matches a complete LCOW destination path.
	lcowMountDestinationRegex = lazyregexp.New(`^` + rxLCOWDestination + `$`)
	// lcowSplitRawSpec matches a full raw spec: source, destination, mode.
	lcowSplitRawSpec = lazyregexp.New(`^` + rxSource + rxLCOWDestination + rxMode + `$`)
)
// lcowValidators rejects mount configurations that LCOW cannot support:
// a root target, named-pipe mounts, and non-LCOW destination paths.
var lcowValidators mountValidator = func(m *mount.Mount) error {
	if path.Clean(m.Target) == "/" {
		return ErrVolumeTargetIsRoot
	}
	if m.Type == mount.TypeNamedPipe {
		return errors.New("Linux containers on Windows do not support named pipe mounts")
	}
	if lcowMountDestinationRegex.MatchString(strings.ToLower(m.Target)) {
		return nil
	}
	return fmt.Errorf("invalid mount path: '%s'", m.Target)
}
// lcowParser parses mount specs for Linux Containers on Windows, reusing the
// Windows parser machinery with LCOW-specific validators and regexes.
type lcowParser struct {
	windowsParser
}

// ValidateMountConfig validates mnt using the LCOW validators.
func (p *lcowParser) ValidateMountConfig(mnt *mount.Mount) error {
	return p.validateMountConfigReg(mnt, lcowValidators)
}

// ParseMountRaw parses a raw "source:destination[:mode]" volume spec.
func (p *lcowParser) ParseMountRaw(raw, volumeDriver string) (*MountPoint, error) {
	arr, err := p.splitRawSpec(raw, lcowSplitRawSpec)
	if err != nil {
		return nil, err
	}
	return p.parseMount(arr, raw, volumeDriver, false, lcowValidators)
}

// ParseMountSpec parses a structured mount.Mount config.
func (p *lcowParser) ParseMountSpec(cfg mount.Mount) (*MountPoint, error) {
	return p.parseMountSpec(cfg, false, lcowValidators)
}
package mounts
import (
"errors"
"fmt"
"path"
"path/filepath"
"strings"
"github.com/docker/docker/daemon/volume"
"github.com/moby/moby/api/types/mount"
)
// NewLinuxParser creates a parser with Linux semantics.
func NewLinuxParser() Parser {
	return &linuxParser{
		fi: defaultFileInfoProvider{},
	}
}

// linuxParser parses and validates mount specs with Linux path semantics.
type linuxParser struct {
	// fi abstracts filesystem stat calls so tests can substitute a fake.
	fi fileInfoProvider
}
// linuxValidateNotRoot rejects "/" as a mount target after normalizing
// Windows-style backslashes to forward slashes.
func linuxValidateNotRoot(p string) error {
	cleaned := path.Clean(strings.ReplaceAll(p, `\`, `/`))
	if cleaned != "/" {
		return nil
	}
	return ErrVolumeTargetIsRoot
}
// linuxValidateAbsolute rejects non-absolute mount paths, normalizing
// Windows-style backslashes to forward slashes first.
func linuxValidateAbsolute(p string) error {
	normalized := strings.ReplaceAll(p, `\`, `/`)
	if !path.IsAbs(normalized) {
		return fmt.Errorf("invalid mount path: '%s' mount path must be absolute", normalized)
	}
	return nil
}
// ValidateMountConfig validates mnt with Linux semantics, including checking
// that a bind-mount source exists.
func (p *linuxParser) ValidateMountConfig(mnt *mount.Mount) error {
	// there was something looking like a bug in existing codebase:
	// - validateMountConfig on linux was called with options skipping bind source existence when calling ParseMountRaw
	// - but not when calling ParseMountSpec directly... nor when the unit test called it directly
	return p.validateMountConfigImpl(mnt, true)
}
// validateMountConfigImpl validates a mount.Mount according to its Type.
// Target is always required, must be absolute, and must not be "/". Each
// type then rejects option structs belonging to other types (e.g.
// VolumeOptions on a bind mount) and applies its own source/subpath rules.
// When validateBindSourceExists is true, a bind source must exist on the
// host unless BindOptions.CreateMountpoint is set.
func (p *linuxParser) validateMountConfigImpl(mnt *mount.Mount, validateBindSourceExists bool) error {
	if mnt.Target == "" {
		return &errMountConfig{mnt, errMissingField("Target")}
	}
	if err := linuxValidateNotRoot(mnt.Target); err != nil {
		return &errMountConfig{mnt, err}
	}
	if err := linuxValidateAbsolute(mnt.Target); err != nil {
		return &errMountConfig{mnt, err}
	}
	switch mnt.Type {
	case mount.TypeBind:
		if mnt.Source == "" {
			return &errMountConfig{mnt, errMissingField("Source")}
		}
		// Don't error out just because the propagation mode is not supported on the platform
		if opts := mnt.BindOptions; opts != nil {
			if len(opts.Propagation) > 0 && len(linuxPropagationModes) > 0 {
				if _, ok := linuxPropagationModes[opts.Propagation]; !ok {
					return &errMountConfig{mnt, fmt.Errorf("invalid propagation mode: %s", opts.Propagation)}
				}
			}
		}
		if mnt.VolumeOptions != nil {
			return &errMountConfig{mnt, errExtraField("VolumeOptions")}
		}
		if mnt.ImageOptions != nil {
			return &errMountConfig{mnt, errExtraField("ImageOptions")}
		}
		if err := linuxValidateAbsolute(mnt.Source); err != nil {
			return &errMountConfig{mnt, err}
		}
		if validateBindSourceExists {
			exists, _, err := p.fi.fileInfo(mnt.Source)
			if err != nil {
				return &errMountConfig{mnt, err}
			}
			// A missing source is tolerated when the caller asked for the
			// mountpoint to be auto-created.
			createMountpoint := mnt.BindOptions != nil && mnt.BindOptions.CreateMountpoint
			if !exists && !createMountpoint {
				return &errMountConfig{mnt, errBindSourceDoesNotExist(mnt.Source)}
			}
		}
	case mount.TypeVolume:
		if mnt.BindOptions != nil {
			return &errMountConfig{mnt, errExtraField("BindOptions")}
		}
		if mnt.ImageOptions != nil {
			return &errMountConfig{mnt, errExtraField("ImageOptions")}
		}
		// An empty source means an anonymous volume; those may not use
		// Subpath and may not be read-only.
		anonymousVolume := mnt.Source == ""
		if mnt.VolumeOptions != nil && mnt.VolumeOptions.Subpath != "" {
			if anonymousVolume {
				return &errMountConfig{mnt, errAnonymousVolumeWithSubpath}
			}
			// Subpath must stay lexically inside the volume (no "..", not absolute).
			if !filepath.IsLocal(mnt.VolumeOptions.Subpath) {
				return &errMountConfig{mnt, errInvalidSubpath}
			}
		}
		if mnt.ReadOnly && anonymousVolume {
			return &errMountConfig{mnt, errors.New("must not set ReadOnly mode when using anonymous volumes")}
		}
	case mount.TypeTmpfs:
		if mnt.BindOptions != nil {
			return &errMountConfig{mnt, errExtraField("BindOptions")}
		}
		if mnt.ImageOptions != nil {
			return &errMountConfig{mnt, errExtraField("ImageOptions")}
		}
		if mnt.Source != "" {
			return &errMountConfig{mnt, errExtraField("Source")}
		}
		// Validate tmpfs options by attempting the conversion.
		if _, err := p.ConvertTmpfsOptions(mnt.TmpfsOptions, mnt.ReadOnly); err != nil {
			return &errMountConfig{mnt, err}
		}
	case mount.TypeImage:
		if mnt.BindOptions != nil {
			return &errMountConfig{mnt, errExtraField("BindOptions")}
		}
		if mnt.VolumeOptions != nil {
			return &errMountConfig{mnt, errExtraField("VolumeOptions")}
		}
		if mnt.Source == "" {
			return &errMountConfig{mnt, errMissingField("Source")}
		}
		if mnt.ImageOptions != nil && mnt.ImageOptions.Subpath != "" {
			if !filepath.IsLocal(mnt.ImageOptions.Subpath) {
				return &errMountConfig{mnt, errInvalidSubpath}
			}
		}
	default:
		return &errMountConfig{mnt, errors.New("mount type unknown")}
	}
	return nil
}
// label modes (SELinux relabeling: "Z" private, "z" shared)
var linuxLabelModes = map[string]bool{
	"Z": true,
	"z": true,
}

// consistency modes
var linuxConsistencyModes = map[mount.Consistency]bool{
	mount.ConsistencyFull:      true,
	mount.ConsistencyCached:    true,
	mount.ConsistencyDelegated: true,
}

// propagation modes accepted for bind mounts on Linux
var linuxPropagationModes = map[mount.Propagation]bool{
	mount.PropagationPrivate:  true,
	mount.PropagationRPrivate: true,
	mount.PropagationSlave:    true,
	mount.PropagationRSlave:   true,
	mount.PropagationShared:   true,
	mount.PropagationRShared:  true,
}

// linuxDefaultPropagationMode is used when the spec names no propagation mode.
const linuxDefaultPropagationMode = mount.PropagationRPrivate
// linuxGetPropagation returns the first recognized propagation mode found
// in the comma-separated mode string, or linuxDefaultPropagationMode when
// none is present.
func linuxGetPropagation(mode string) mount.Propagation {
	for _, opt := range strings.Split(mode, ",") {
		if prop := mount.Propagation(opt); linuxPropagationModes[prop] {
			return prop
		}
	}
	return linuxDefaultPropagationMode
}
// linuxHasPropagation reports whether any token of the comma-separated
// mode string is a recognized propagation mode.
func linuxHasPropagation(mode string) bool {
	opts := strings.Split(mode, ",")
	for _, opt := range opts {
		if _, ok := linuxPropagationModes[mount.Propagation(opt)]; ok {
			return true
		}
	}
	return false
}
// linuxValidMountMode reports whether mode is a valid comma-separated mount
// mode string: every token must belong to one of the known categories
// (rw/ro, SELinux label, propagation, copy, consistency), and no category
// may appear more than once. The empty string is valid.
func linuxValidMountMode(mode string) bool {
	if mode == "" {
		return true
	}
	seen := make(map[string]int, 5)
	for _, token := range strings.Split(mode, ",") {
		switch {
		case rwModes[token]:
			seen["rw"]++
		case linuxLabelModes[token]:
			seen["label"]++
		case linuxPropagationModes[mount.Propagation(token)]:
			seen["propagation"]++
		case copyModeExists(token):
			seen["copy"]++
		case linuxConsistencyModes[mount.Consistency(token)]:
			seen["consistency"]++
		default:
			// unrecognized token
			return false
		}
	}
	// Only one string for each mode category is allowed.
	for _, count := range seen {
		if count > 1 {
			return false
		}
	}
	return true
}
// validTmpfsOptions is the allow-list of extra tmpfs mount options.
var validTmpfsOptions = map[string]bool{
	"exec":   true,
	"noexec": true,
}

// validateTmpfsOptions converts raw option pairs into mount-option strings.
// Each entry must be ["name"] or ["name", "value"]; the name must be in
// validTmpfsOptions. A bare name yields "name", a pair yields "name=value".
func validateTmpfsOptions(rawOptions [][]string) ([]string, error) {
	var options []string
	for _, opt := range rawOptions {
		if n := len(opt); n < 1 || n > 2 {
			return nil, errors.New("invalid option array length")
		}
		if !validTmpfsOptions[opt[0]] {
			return nil, errors.New("invalid option: " + opt[0])
		}
		rendered := opt[0]
		if len(opt) == 2 {
			rendered += "=" + opt[1]
		}
		options = append(options, rendered)
	}
	return options, nil
}
// ReadWrite reports whether mode describes a writable mount. An invalid
// mode string is treated as not-writable; otherwise the mount is writable
// unless an explicit "ro" token is present.
func (p *linuxParser) ReadWrite(mode string) bool {
	if !linuxValidMountMode(mode) {
		return false
	}
	for _, token := range strings.Split(mode, ",") {
		if token == "ro" {
			return false
		}
	}
	return true
}
// ParseMountRaw parses a raw volume spec of the form
// [source:]target[:mode] (colon-separated, at most three parts) into a
// MountPoint. An absolute source becomes a bind mount; any other source
// (including none) names a volume.
func (p *linuxParser) ParseMountRaw(raw, volumeDriver string) (*MountPoint, error) {
	arr := strings.SplitN(raw, ":", 4)
	if arr[0] == "" {
		return nil, errInvalidSpec(raw)
	}
	var spec mount.Mount
	var mode string
	switch len(arr) {
	case 1:
		// Just a destination path in the container
		spec.Target = arr[0]
	case 2:
		if linuxValidMountMode(arr[1]) {
			// Destination + Mode is not a valid volume - volumes
			// cannot include a mode. e.g. /foo:rw
			return nil, errInvalidSpec(raw)
		}
		// Host Source Path or Name + Destination
		spec.Source = arr[0]
		spec.Target = arr[1]
	case 3:
		// HostSourcePath+DestinationPath+Mode
		spec.Source = arr[0]
		spec.Target = arr[1]
		mode = arr[2]
	default:
		// Four parts (the SplitN limit) is always invalid.
		return nil, errInvalidSpec(raw)
	}
	if !linuxValidMountMode(mode) {
		return nil, errInvalidMode(mode)
	}
	// Absolute source => bind mount; otherwise a (named or anonymous) volume.
	if path.IsAbs(spec.Source) {
		spec.Type = mount.TypeBind
	} else {
		spec.Type = mount.TypeVolume
	}
	spec.ReadOnly = !p.ReadWrite(mode)
	// cannot assume that if a volume driver is passed in that we should set it
	if volumeDriver != "" && spec.Type == mount.TypeVolume {
		spec.VolumeOptions = &mount.VolumeOptions{
			DriverConfig: &mount.Driver{Name: volumeDriver},
		}
	}
	// Copy mode (e.g. "nocopy") only affects volumes.
	if copyData, isSet := getCopyMode(mode, p.DefaultCopyMode()); isSet {
		if spec.VolumeOptions == nil {
			spec.VolumeOptions = &mount.VolumeOptions{}
		}
		spec.VolumeOptions.NoCopy = !copyData
	}
	if linuxHasPropagation(mode) {
		spec.BindOptions = &mount.BindOptions{
			Propagation: linuxGetPropagation(mode),
		}
	}
	// Note: bind source existence is not validated for raw specs
	// (validateBindSourceExists=false).
	mp, err := p.parseMountSpec(spec, false)
	if mp != nil {
		mp.Mode = mode
	}
	if err != nil {
		err = fmt.Errorf("%v: %v", errInvalidSpec(raw), err)
	}
	return mp, err
}
// ParseMountSpec parses a mount.Mount API struct into a MountPoint,
// validating that any bind-mount source exists on the host.
func (p *linuxParser) ParseMountSpec(cfg mount.Mount) (*MountPoint, error) {
	return p.parseMountSpec(cfg, true)
}

// parseMountSpec validates cfg and converts it into a MountPoint. The
// destination (and, for bind mounts, the source) is cleaned and converted
// to slash-separated form.
func (p *linuxParser) parseMountSpec(cfg mount.Mount, validateBindSourceExists bool) (*MountPoint, error) {
	if err := p.validateMountConfigImpl(&cfg, validateBindSourceExists); err != nil {
		return nil, err
	}
	mp := &MountPoint{
		RW:          !cfg.ReadOnly,
		Destination: path.Clean(filepath.ToSlash(cfg.Target)),
		Type:        cfg.Type,
		Spec:        cfg,
	}
	switch cfg.Type {
	case mount.TypeVolume:
		if cfg.Source != "" {
			// non-anonymous volume
			mp.Name = cfg.Source
		}
		mp.CopyData = p.DefaultCopyMode()
		if cfg.VolumeOptions != nil {
			if cfg.VolumeOptions.DriverConfig != nil {
				mp.Driver = cfg.VolumeOptions.DriverConfig.Name
			}
			if cfg.VolumeOptions.NoCopy {
				mp.CopyData = false
			}
		}
	case mount.TypeBind:
		mp.Source = path.Clean(filepath.ToSlash(cfg.Source))
		if cfg.BindOptions != nil && len(cfg.BindOptions.Propagation) > 0 {
			mp.Propagation = cfg.BindOptions.Propagation
		} else {
			// If user did not specify a propagation mode, get
			// default propagation mode.
			mp.Propagation = linuxDefaultPropagationMode
		}
	case mount.TypeTmpfs:
		// NOP
	case mount.TypeImage:
		// Image source is an image reference, not a path: no cleaning.
		mp.Source = cfg.Source
		if cfg.BindOptions != nil && len(cfg.BindOptions.Propagation) > 0 {
			mp.Propagation = cfg.BindOptions.Propagation
		} else {
			// If user did not specify a propagation mode, get
			// default propagation mode.
			mp.Propagation = linuxDefaultPropagationMode
		}
	default:
		// TODO(thaJeztah): make switch exhaustive: anything to do for mount.TypeNamedPipe, mount.TypeCluster ?
	}
	return mp, nil
}
// ParseVolumesFrom parses a volumes-from specification of the form
// "id[:mode]". A missing mode defaults to "rw". Propagation and copy
// modes are rejected: imported volumes inherit those properties from the
// original volume in the data container.
func (p *linuxParser) ParseVolumesFrom(spec string) (string, string, error) {
	if spec == "" {
		return "", "", errors.New("volumes-from specification cannot be an empty string")
	}
	id, mode, _ := strings.Cut(spec, ":")
	if mode == "" {
		return id, "rw", nil
	}
	switch {
	case !linuxValidMountMode(mode):
		return "", "", errInvalidMode(mode)
	case linuxHasPropagation(mode):
		// For now don't allow propagation properties while importing
		// volumes from data container. These volumes will inherit
		// the same propagation property as of the original volume
		// in data container. This probably can be relaxed in future.
		return "", "", errInvalidMode(mode)
	}
	// Do not allow copy modes on volumes-from
	if _, isSet := getCopyMode(mode, p.DefaultCopyMode()); isSet {
		return "", "", errInvalidMode(mode)
	}
	return id, mode, nil
}
// DefaultPropagationMode returns the propagation used when a spec names none.
func (p *linuxParser) DefaultPropagationMode() mount.Propagation {
	return linuxDefaultPropagationMode
}

// ConvertTmpfsOptions renders TmpfsOptions (plus the read-only flag) into a
// comma-separated mount-option string: "ro", "mode=<octal>", "size=<n><sfx>",
// then any validated extra options.
func (p *linuxParser) ConvertTmpfsOptions(opt *mount.TmpfsOptions, readOnly bool) (string, error) {
	var rawOpts []string
	if readOnly {
		rawOpts = append(rawOpts, "ro")
	}
	if opt != nil && opt.Mode != 0 {
		rawOpts = append(rawOpts, fmt.Sprintf("mode=%o", opt.Mode))
	}
	if opt != nil && opt.SizeBytes != 0 {
		// calculate suffix here, making this linux specific, but that is
		// okay, since API is that way anyways.
		// we do this by finding the suffix that divides evenly into the
		// value, returning the value itself, with no suffix, if it fails.
		//
		// For the most part, we don't enforce any semantic to this values.
		// The operating system will usually align this and enforce minimum
		// and maximums.
		var (
			size   = opt.SizeBytes
			suffix string
		)
		for _, r := range []struct {
			suffix  string
			divisor int64
		}{
			{"g", 1 << 30},
			{"m", 1 << 20},
			{"k", 1 << 10},
		} {
			if size%r.divisor == 0 {
				size = size / r.divisor
				suffix = r.suffix
				break
			}
		}
		rawOpts = append(rawOpts, fmt.Sprintf("size=%d%s", size, suffix))
	}
	if opt != nil && len(opt.Options) > 0 {
		tmpfsOpts, err := validateTmpfsOptions(opt.Options)
		if err != nil {
			return "", err
		}
		rawOpts = append(rawOpts, tmpfsOpts...)
	}
	return strings.Join(rawOpts, ","), nil
}
// DefaultCopyMode reports that volumes copy existing container data by default.
func (p *linuxParser) DefaultCopyMode() bool {
	return true
}

// ValidateVolumeName accepts any volume name on Linux.
func (p *linuxParser) ValidateVolumeName(name string) error {
	return nil
}

// IsBackwardCompatible reports whether the mount point can be expressed in
// the legacy (pre-mount-API) format: it has a source, or uses the default
// volume driver.
func (p *linuxParser) IsBackwardCompatible(m *MountPoint) bool {
	return m.Source != "" || m.Driver == volume.DefaultDriverName
}

// ValidateTmpfsMountDestination rejects tmpfs destinations that are root
// or not absolute.
func (p *linuxParser) ValidateTmpfsMountDestination(dest string) error {
	if err := linuxValidateNotRoot(dest); err != nil {
		return err
	}
	return linuxValidateAbsolute(dest)
}
package mounts
import (
"context"
"path/filepath"
"runtime/debug"
"syscall"
"github.com/containerd/log"
"github.com/docker/docker/daemon/volume"
"github.com/docker/docker/daemon/volume/safepath"
"github.com/docker/docker/pkg/idtools"
"github.com/docker/docker/pkg/stringid"
mounttypes "github.com/moby/moby/api/types/mount"
"github.com/moby/sys/user"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/pkg/errors"
)
// RWLayer represents a writable layer.
type RWLayer interface {
	// Mount mounts the RWLayer and returns the filesystem path
	// to the writable layer.
	Mount(mountLabel string) (string, error)

	// Unmount unmounts the RWLayer. This should be called
	// for every mount. If there are multiple mount calls
	// this operation will only decrement the internal mount counter.
	Unmount() error

	// Metadata returns the low level metadata for the mutable layer.
	Metadata() (map[string]string, error)
}
// MountPoint is the intersection point between a volume and a container. It
// specifies which volume is to be used and where inside a container it should
// be mounted.
//
// Note that this type is embedded in `container.Container` object and persisted to disk.
// Changes to this struct need to be synced with on disk state.
type MountPoint struct {
	// Source is the source path of the mount.
	// E.g. `mount --bind /foo /bar`, `/foo` is the `Source`.
	Source string
	// Destination is the path relative to the container root (`/`) to the mount point
	// It is where the `Source` is mounted to
	Destination string
	// RW is set to true when the mountpoint should be mounted as read-write
	RW bool
	// Name is the name reference to the underlying data defined by `Source`
	// e.g., the volume name
	Name string
	// Driver is the volume driver used to create the volume (if it is a volume)
	Driver string
	// Type of mount to use, see `Type<foo>` definitions in github.com/docker/docker/api/types/mount
	Type mounttypes.Type `json:",omitempty"`
	// Volume is the volume providing data to this mountpoint.
	// This is nil unless `Type` is set to `TypeVolume`
	Volume volume.Volume `json:"-"`
	// Mode is the comma separated list of options supplied by the user when creating
	// the bind/volume mount.
	// Note Mode is not used on Windows.
	// Serialized under the legacy key `Relabel` for on-disk compatibility.
	Mode string `json:"Relabel,omitempty"` // Originally field was `Relabel`
	// Propagation describes how the mounts are propagated from the host into the
	// mount point, and vice-versa.
	// See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
	// Note Propagation is not used on Windows
	Propagation mounttypes.Propagation `json:",omitempty"` // Mount propagation string
	// CopyData specifies if data should be copied from the container before
	// the first mount.
	// NOTE(review): an earlier comment said a pointer was used to detect an
	// explicit user setting, but the field is a plain bool — confirm intent.
	CopyData bool `json:"-"`
	// ID is the opaque ID used to pass to the volume driver.
	// This should be set by calls to `Mount` and unset by calls to `Unmount`
	ID string `json:",omitempty"`
	// Spec is a copy of the API request that created this mount.
	Spec mounttypes.Mount
	// Some bind mounts should not be automatically created.
	// (Some are auto-created for backwards-compatibility)
	// This is checked on the API but setting this here prevents race conditions
	// where a bind dir existed during validation but was removed before reaching
	// the setup code.
	SkipMountpointCreation bool
	// active tracks usage of this mountpoint.
	// Specifically needed for containers which are running and calls to `docker cp`
	// because both these actions require mounting the volumes.
	active int
	// safePaths are SafePaths created by Setup that should be cleaned up
	// before unmounting the volume.
	safePaths []*safepath.SafePath

	// Layer is the writable layer backing this mount point (not persisted).
	Layer RWLayer `json:"-"`
}
// Cleanup frees resources used by the mountpoint and cleans up all the paths
// returned by Setup that haven't been cleaned up by the caller: it closes any
// still-valid SafePaths, unmounts the volume once, and decrements the active
// count (clearing ID when it reaches zero). A no-op when there is no volume
// or no active mount ID.
func (m *MountPoint) Cleanup(ctx context.Context) error {
	if m.Volume == nil || m.ID == "" {
		return nil
	}
	logger := log.G(ctx).WithFields(log.Fields{"active": m.active, "id": m.ID})

	// TODO: Remove once the real bug is fixed: https://github.com/moby/moby/issues/46508
	if m.active == 0 {
		logger.Error("An attempt to decrement a zero mount count")
		logger.Error(string(debug.Stack()))
		return nil
	}

	// Close leftover SafePaths before unmounting; a Close error is only
	// logged, not returned.
	for _, p := range m.safePaths {
		if !p.IsValid() {
			continue
		}
		err := p.Close(ctx)
		base, sub := p.SourcePath()
		log.G(ctx).WithFields(log.Fields{
			"error":         err,
			"path":          p.Path(),
			"sourceBase":    base,
			"sourceSubpath": sub,
		}).Warn("cleaning up SafePath that hasn't been cleaned up by the caller")
	}

	if err := m.Volume.Unmount(m.ID); err != nil {
		return errors.Wrapf(err, "error unmounting volume %s", m.Volume.Name())
	}

	m.active--
	logger.Debug("MountPoint.Cleanup Decrement active count")

	if m.active == 0 {
		m.ID = ""
	}
	return nil
}
// Setup sets up a mount point by either mounting the volume if it is
// configured, or creating the source directory if supplied.
// The, optional, checkFun parameter allows doing additional checking
// before creating the source directory on the host.
//
// The returned path can be a temporary path, caller is responsible to
// call the returned cleanup function as soon as the path is not needed.
// Cleanup doesn't unmount the underlying volumes (if any), it only
// frees up the resources that were needed to guarantee that the path
// still points to the same target (to avoid TOCTOU attack).
//
// Cleanup function doesn't need to be called when error is returned.
func (m *MountPoint) Setup(ctx context.Context, mountLabel string, rootIDs idtools.Identity, checkFun func(m *MountPoint) error) (mountPath string, cleanup func(context.Context) error, retErr error) {
	if m.SkipMountpointCreation {
		return m.Source, noCleanup, nil
	}

	// On success, relabel the resolved mount path (SELinux) when the mode
	// requests it. On relabel failure the already-computed cleanup is run
	// and replaced with a no-op so the caller never double-cleans.
	defer func() {
		if retErr != nil || !label.RelabelNeeded(m.Mode) {
			return
		}

		sourcePath, err := filepath.EvalSymlinks(mountPath)
		if err != nil {
			mountPath = ""
			retErr = errors.Wrapf(err, "error evaluating symlinks from mount source %q", m.Source)
			if cleanupErr := cleanup(ctx); cleanupErr != nil {
				log.G(ctx).WithError(cleanupErr).Warn("failed to cleanup after error")
			}
			cleanup = noCleanup
			return
		}
		err = label.Relabel(sourcePath, mountLabel, label.IsShared(m.Mode))
		// ENOTSUP (filesystem without label support) is tolerated.
		if err != nil && !errors.Is(err, syscall.ENOTSUP) {
			mountPath = ""
			retErr = errors.Wrapf(err, "error setting label on mount source '%s'", sourcePath)
			if cleanupErr := cleanup(ctx); cleanupErr != nil {
				log.G(ctx).WithError(cleanupErr).Warn("failed to cleanup after error")
			}
			cleanup = noCleanup
		}
	}()

	// Volume-backed mount: mount the volume (reusing an existing ID if set),
	// and resolve any Subpath through safepath to guard against TOCTOU.
	if m.Volume != nil {
		id := m.ID
		if id == "" {
			id = stringid.GenerateRandomID()
		}
		volumePath, err := m.Volume.Mount(id)
		if err != nil {
			return "", noCleanup, errors.Wrapf(err, "error while mounting volume '%s'", m.Source)
		}

		m.ID = id
		clean := noCleanup
		if m.Spec.VolumeOptions != nil && m.Spec.VolumeOptions.Subpath != "" {
			subpath := m.Spec.VolumeOptions.Subpath

			safePath, err := safepath.Join(ctx, volumePath, subpath)
			if err != nil {
				// Undo the volume mount; the error from Join is what the
				// caller sees.
				if err := m.Volume.Unmount(id); err != nil {
					log.G(ctx).WithError(err).Error("failed to unmount after safepath.Join failed")
				}
				return "", noCleanup, err
			}
			m.safePaths = append(m.safePaths, safePath)
			log.G(ctx).Debugf("mounting (%s|%s) via %s", volumePath, subpath, safePath.Path())

			clean = safePath.Close
			volumePath = safePath.Path()
		}
		m.active++
		return volumePath, clean, nil
	}

	// Image-backed mount with a Subpath: resolve it through safepath.
	if m.Type == mounttypes.TypeImage {
		if m.Spec.ImageOptions != nil && m.Spec.ImageOptions.Subpath != "" {
			subpath := m.Spec.ImageOptions.Subpath

			safePath, err := safepath.Join(ctx, m.Source, subpath)
			if err != nil {
				return "", noCleanup, err
			}
			m.safePaths = append(m.safePaths, safePath)
			log.G(ctx).Debugf("mounting (%s|%s) via %s", m.Source, subpath, safePath.Path())

			return safePath.Path(), safePath.Close, nil
		}
	}

	if m.Source == "" {
		return "", noCleanup, errors.New("Unable to setup mount point, neither source nor volume defined")
	}

	if m.Type == mounttypes.TypeBind {
		// Before creating the source directory on the host, invoke checkFun if it's not nil. One of
		// the use case is to forbid creating the daemon socket as a directory if the daemon is in
		// the process of shutting down.
		if checkFun != nil {
			if err := checkFun(m); err != nil {
				return "", noCleanup, err
			}
		}

		// Auto-create directories on the host if they're missing. Note that
		// this is a best-effort; newly created directories are created with
		// the correct (remapped) rootUID/rootGID ownership, but existing
		// directories are not chown'ed.
		//
		// This also defaults to assuming the host-path was intended to be a
		// directory; user.MkdirAllAndChown produces an error if the
		// path exists but is a file (not a directory). We ignore this case,
		// but an error may occur if the destination path inside the container
		// is a directory (cannot bind-mount a file on a directory and vice-versa).
		if err := user.MkdirAllAndChown(m.Source, 0o755, rootIDs.UID, rootIDs.GID, user.WithOnlyNew); err != nil {
			if !errors.Is(err, syscall.ENOTDIR) {
				return "", noCleanup, errors.Wrapf(err, "error while creating mount source path '%s'", m.Source)
			}
		}
	}
	return m.Source, noCleanup, nil
}
// LiveRestore restores the mount's volume state after a daemon restart:
// if the volume implements volume.LiveRestorer, the existing (or a newly
// generated) mount ID is restored and the active count incremented.
// A no-op (with a debug log) for non-volume mounts and volumes that do not
// support live restore.
func (m *MountPoint) LiveRestore(ctx context.Context) error {
	if m.Volume == nil {
		log.G(ctx).Debug("No volume to restore")
		return nil
	}

	lrv, ok := m.Volume.(volume.LiveRestorer)
	if !ok {
		log.G(ctx).WithField("volume", m.Volume.Name()).Debugf("Volume does not support live restore: %T", m.Volume)
		return nil
	}

	id := m.ID
	if id == "" {
		id = stringid.GenerateRandomID()
	}

	if err := lrv.LiveRestoreVolume(ctx, id); err != nil {
		return errors.Wrapf(err, "error while restoring volume '%s'", m.Source)
	}

	m.ID = id
	m.active++
	return nil
}
// Path returns the path of a volume in a mount point: the volume's own
// path when a volume is attached, otherwise the raw Source.
func (m *MountPoint) Path() string {
	if v := m.Volume; v != nil {
		return v.Path()
	}
	return m.Source
}
// errInvalidMode returns an error for an unrecognized mount-mode string.
func errInvalidMode(mode string) error {
	return errors.Errorf("invalid mode: %v", mode)
}

// errInvalidSpec returns an error for a malformed volume specification.
func errInvalidSpec(spec string) error {
	return errors.Errorf("invalid volume specification: '%s'", spec)
}

// noCleanup is a no-op cleanup function.
func noCleanup(_ context.Context) error {
	return nil
}
package mounts
import (
"errors"
"runtime"
"github.com/moby/moby/api/types/mount"
)
// ErrVolumeTargetIsRoot is returned when the target destination is root.
// It's used by both LCOW and Linux parsers.
var ErrVolumeTargetIsRoot = errors.New("invalid specification: destination can't be '/'")

// errAnonymousVolumeWithSubpath is returned when Subpath is specified for
// an anonymous volume.
var errAnonymousVolumeWithSubpath = errors.New("must not set Subpath when using anonymous volumes")

// errInvalidSubpath is returned when the provided Subpath is not lexically a
// relative path within the volume.
var errInvalidSubpath = errors.New("subpath must be a relative path within the volume")

// rwModes are the recognized read-write mode tokens.
var rwModes = map[string]bool{
	"rw": true,
	"ro": true, // attempts recursive read-only if possible
}
// Parser represents a platform specific parser for mount expressions.
type Parser interface {
	// ParseMountRaw parses a raw (colon-separated) volume spec into a MountPoint.
	ParseMountRaw(raw, volumeDriver string) (*MountPoint, error)
	// ParseMountSpec parses a mount.Mount API struct into a MountPoint.
	ParseMountSpec(cfg mount.Mount) (*MountPoint, error)
	// ParseVolumesFrom parses a volumes-from spec into its id and mode.
	ParseVolumesFrom(spec string) (string, string, error)
	// DefaultPropagationMode returns the platform's default propagation mode.
	DefaultPropagationMode() mount.Propagation
	// ConvertTmpfsOptions renders tmpfs options into a mount-option string.
	ConvertTmpfsOptions(opt *mount.TmpfsOptions, readOnly bool) (string, error)
	// DefaultCopyMode reports whether volumes copy container data by default.
	DefaultCopyMode() bool
	// ValidateVolumeName validates a volume name for the platform.
	ValidateVolumeName(name string) error
	// ReadWrite reports whether a mode string describes a writable mount.
	ReadWrite(mode string) bool
	// IsBackwardCompatible reports whether the mount fits the legacy format.
	IsBackwardCompatible(m *MountPoint) bool
	// HasResource reports whether absPath lies under the mount's destination.
	HasResource(m *MountPoint, absPath string) bool
	// ValidateTmpfsMountDestination validates a tmpfs destination path.
	ValidateTmpfsMountDestination(dest string) error
	// ValidateMountConfig fully validates a mount.Mount.
	ValidateMountConfig(mt *mount.Mount) error
}
// NewParser creates a parser for the current host OS: Windows semantics on
// windows, Linux semantics everywhere else.
func NewParser() Parser {
	if runtime.GOOS == "windows" {
		return NewWindowsParser()
	}
	return NewLinuxParser()
}
package mounts
import (
"os"
"runtime"
"testing"
"github.com/moby/moby/api/types/mount"
"gotest.tools/v3/assert"
is "gotest.tools/v3/assert/cmp"
)
// mockFiProvider is a fileInfoProvider backed by a fixed table of paths.
type mockFiProvider struct{}

// fileInfo reports existence and kind for a hard-coded set of paths;
// unknown paths simply do not exist. It never returns an error.
func (mockFiProvider) fileInfo(path string) (exists, isDir bool, _ error) {
	// the map value records whether the known entry is a directory
	known := map[string]bool{
		`c:\`:                           true,
		`c:\windows\`:                   true,
		`c:\windows`:                    true,
		`c:\program files`:              true,
		`c:\Windows`:                    true,
		`c:\Program Files (x86)`:        true,
		`\\?\c:\windows\`:               true,
		`c:\windows\system32\ntdll.dll`: false,
	}
	isDir, exists = known[path]
	return exists, isDir, nil
}
// mockFiProviderWithError is a fileInfoProvider that always returns the
// configured error; used to test error handling.
type mockFiProviderWithError struct{ err error }

// fileInfo ignores the path and reports non-existence plus the stored error.
func (m mockFiProviderWithError) fileInfo(string) (bool, bool, error) {
	return false, false, m.err
}
// TestParseMountSpec exercises the host parser's ParseMountSpec with bind
// and volume mounts (trailing separators included), checking every exposed
// MountPoint field. The image-mount case is Linux-only and appended
// conditionally.
func TestParseMountSpec(t *testing.T) {
	testDir := t.TempDir()
	parser := NewParser()
	tests := []struct {
		input    mount.Mount
		expected MountPoint
	}{
		{
			input:    mount.Mount{Type: mount.TypeBind, Source: testDir, Target: testDestinationPath, ReadOnly: true},
			expected: MountPoint{Type: mount.TypeBind, Source: testDir, Destination: testDestinationPath, Propagation: parser.DefaultPropagationMode()},
		},
		{
			input:    mount.Mount{Type: mount.TypeBind, Source: testDir, Target: testDestinationPath},
			expected: MountPoint{Type: mount.TypeBind, Source: testDir, Destination: testDestinationPath, RW: true, Propagation: parser.DefaultPropagationMode()},
		},
		{
			// trailing separator on the source must be cleaned away
			input:    mount.Mount{Type: mount.TypeBind, Source: testDir + string(os.PathSeparator), Target: testDestinationPath, ReadOnly: true},
			expected: MountPoint{Type: mount.TypeBind, Source: testDir, Destination: testDestinationPath, Propagation: parser.DefaultPropagationMode()},
		},
		{
			// trailing separator on the target must be cleaned away
			input:    mount.Mount{Type: mount.TypeBind, Source: testDir, Target: testDestinationPath + string(os.PathSeparator), ReadOnly: true},
			expected: MountPoint{Type: mount.TypeBind, Source: testDir, Destination: testDestinationPath, Propagation: parser.DefaultPropagationMode()},
		},
		{
			input:    mount.Mount{Type: mount.TypeVolume, Target: testDestinationPath},
			expected: MountPoint{Type: mount.TypeVolume, Destination: testDestinationPath, RW: true, CopyData: parser.DefaultCopyMode()},
		},
		{
			input:    mount.Mount{Type: mount.TypeVolume, Target: testDestinationPath + string(os.PathSeparator)},
			expected: MountPoint{Type: mount.TypeVolume, Destination: testDestinationPath, RW: true, CopyData: parser.DefaultCopyMode()},
		},
	}
	if runtime.GOOS != "windows" {
		tests = append(tests, struct {
			input    mount.Mount
			expected MountPoint
		}{
			input:    mount.Mount{Type: mount.TypeImage, Source: "alpine", Target: testDestinationPath},
			expected: MountPoint{Type: mount.TypeImage, Source: "alpine", Destination: testDestinationPath, RW: true, Propagation: parser.DefaultPropagationMode()},
		})
	}

	for _, tc := range tests {
		t.Run("", func(t *testing.T) {
			mp, err := parser.ParseMountSpec(tc.input)
			assert.NilError(t, err)
			assert.Check(t, is.Equal(mp.Type, tc.expected.Type))
			assert.Check(t, is.Equal(mp.Destination, tc.expected.Destination))
			assert.Check(t, is.Equal(mp.Source, tc.expected.Source))
			assert.Check(t, is.Equal(mp.RW, tc.expected.RW))
			assert.Check(t, is.Equal(mp.Propagation, tc.expected.Propagation))
			assert.Check(t, is.Equal(mp.Driver, tc.expected.Driver))
			assert.Check(t, is.Equal(mp.CopyData, tc.expected.CopyData))
		})
	}
}
package mounts
import (
"fmt"
"github.com/moby/moby/api/types/mount"
"github.com/pkg/errors"
)
// errMountConfig wraps a validation error together with the offending mount,
// so the message can report the mount type.
type errMountConfig struct {
	mount *mount.Mount
	err   error
}

// Error formats the wrapped error, prefixed by the mount type.
func (e *errMountConfig) Error() string {
	return fmt.Sprintf("invalid mount config for type %q: %v", e.mount.Type, e.err.Error())
}
// errBindSourceDoesNotExist returns an error for a bind source missing on the host.
func errBindSourceDoesNotExist(path string) error {
	return errors.Errorf("bind source path does not exist: %s", path)
}

// errExtraField returns an error for an option field that is not valid for the mount type.
func errExtraField(name string) error {
	return errors.Errorf("field %s must not be specified", name)
}

// errMissingField returns an error for a required field that was left empty.
func errMissingField(name string) error {
	return errors.Errorf("field %s must not be empty", name)
}
package mounts
import "strings"
// copyModes maps a user-facing copy-mode token to whether copying is enabled.
var copyModes = map[string]bool{
	"nocopy": false,
}

// copyModeExists reports whether mode is a recognized copy-mode token.
func copyModeExists(mode string) bool {
	_, ok := copyModes[mode]
	return ok
}

// getCopyMode scans a comma-separated mode string for a copy-mode token.
// It returns (enabled, true) for the first token found, or (def, false)
// when the string contains no copy-mode token.
func getCopyMode(mode string, def bool) (bool, bool) {
	for _, token := range strings.Split(mode, ",") {
		if enabled, ok := copyModes[token]; ok {
			return enabled, true
		}
	}
	return def, false
}
//go:build linux || freebsd || darwin
package mounts
import (
"fmt"
"path/filepath"
"strings"
)
// HasResource reports whether absolutePath is located at or below the
// mount's destination: the relative path from Destination must not be
// ".." itself or start with "../".
func (p *linuxParser) HasResource(m *MountPoint, absolutePath string) bool {
	rel, err := filepath.Rel(m.Destination, absolutePath)
	if err != nil {
		return false
	}
	parentPrefix := fmt.Sprintf("..%c", filepath.Separator)
	return rel != ".." && !strings.HasPrefix(rel, parentPrefix)
}
package mounts
import (
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"github.com/docker/docker/internal/lazyregexp"
"github.com/moby/moby/api/types/mount"
)
// NewWindowsParser creates a parser with Windows semantics.
func NewWindowsParser() Parser {
	return &windowsParser{
		// default provider stats the real filesystem; tests inject mocks.
		fi: defaultFileInfoProvider{},
	}
}

// windowsParser parses mount specs using Windows path and mode semantics.
type windowsParser struct {
	// fi is used to check existence/kind of host paths.
	fi fileInfoProvider
}
const (
	// Spec should be in the format [source:]destination[:mode]
	//
	// Examples: c:\foo bar:d:rw
	//           c:\foo:d:\bar
	//           myname:d:
	//           d:\
	//
	// Explanation of this regex! Thanks @thaJeztah on IRC and gist for help. See
	// https://gist.github.com/thaJeztah/6185659e4978789fb2b2. A good place to
	// test is https://regex-golang.appspot.com/assets/html/index.html
	//
	// Useful link for referencing named capturing groups:
	// http://stackoverflow.com/questions/20750843/using-named-matches-from-go-regex
	//
	// There are three match groups: source, destination and mode.
	//

	// rxHostDir is the first option of a source
	rxHostDir = `(?:\\\\\?\\)?[a-z]:[\\/](?:[^\\/:*?"<>|\r\n]+[\\/]?)*`
	// rxName is the second option of a source
	rxName = `[^\\/:*?"<>|\r\n]+`

	// rxReservedNames are reserved names not possible on Windows
	rxReservedNames = `(con|prn|nul|aux|com[1-9]|lpt[1-9])`

	// rxPipe is a named path pipe (starts with `\\.\pipe\`, possibly with / instead of \)
	rxPipe = `[/\\]{2}.[/\\]pipe[/\\][^:*?"<>|\r\n]+`
	// rxSource is the combined possibilities for a source
	rxSource = `((?P<source>((` + rxHostDir + `)|(` + rxName + `)|(` + rxPipe + `))):)?`

	// Source. Can be either a host directory, a name, or omitted:
	//  HostDir:
	//    -  Essentially using the folder solution from
	//       https://www.safaribooksonline.com/library/view/regular-expressions-cookbook/9781449327453/ch08s18.html
	//       but adding case insensitivity.
	//    -  Must be an absolute path such as c:\path
	//    -  Can include spaces such as `c:\program files`
	//    -  And then followed by a colon which is not in the capture group
	//    -  And can be optional
	//  Name:
	//    -  Must not contain invalid NTFS filename characters (https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
	//    -  And then followed by a colon which is not in the capture group
	//    -  And can be optional

	// rxDestination is the regex expression for the mount destination
	rxDestination = `(?P<destination>((?:\\\\\?\\)?([a-z]):((?:[\\/][^\\/:*?"<>\r\n]+)*[\\/]?))|(` + rxPipe + `))`

	// rxMode is the regex expression for the mode of the mount
	// Mode (optional):
	//    -  Hopefully self explanatory in comparison to above regex's.
	//    -  Colon is not in the capture group
	rxMode = `(:(?P<mode>(?i)ro|rw))?`
)
var (
	// volumeNameRegexp matches a bare volume name.
	volumeNameRegexp = lazyregexp.New(`^` + rxName + `$`)
	// reservedNameRegexp matches Windows reserved filenames (con, prn, ...).
	reservedNameRegexp = lazyregexp.New(`^` + rxReservedNames + `$`)
	// hostDirRegexp matches an absolute host directory path.
	hostDirRegexp = lazyregexp.New(`^` + rxHostDir + `$`)
	// mountDestinationRegexp matches a valid mount destination.
	mountDestinationRegexp = lazyregexp.New(`^` + rxDestination + `$`)
	// windowsSplitRawSpecRegexp matches a full [source:]destination[:mode] spec.
	windowsSplitRawSpecRegexp = lazyregexp.New(`^` + rxSource + rxDestination + rxMode + `$`)
)

// mountValidator is a platform-specific hook used to validate a mount.Mount.
type mountValidator func(mnt *mount.Mount) error
// splitRawSpec splits a raw volume spec into its [source, destination, mode]
// parts using the supplied regexp's named capture groups. The spec is
// lower-cased before matching. Empty groups are omitted from the result.
func (p *windowsParser) splitRawSpec(raw string, splitRegexp *lazyregexp.Regexp) ([]string, error) {
	match := splitRegexp.FindStringSubmatch(strings.ToLower(raw))
	if len(match) == 0 {
		return nil, errInvalidSpec(raw)
	}

	var split []string
	matchgroups := make(map[string]string)
	// Pull out the sub expressions from the named capture groups
	for i, name := range splitRegexp.SubexpNames() {
		matchgroups[name] = strings.ToLower(match[i])
	}
	if source, exists := matchgroups["source"]; exists {
		if source != "" {
			split = append(split, source)
		}
	}
	if destination, exists := matchgroups["destination"]; exists {
		if destination != "" {
			split = append(split, destination)
		}
	}
	if mode, exists := matchgroups["mode"]; exists {
		if mode != "" {
			split = append(split, mode)
		}
	}
	// Fix #26329. If the destination appears to be a file, and the source is null,
	// it may be because we've fallen through the possible naming regex and hit a
	// situation where the user intention was to map a file into a container through
	// a local volume, but this is not supported by the platform.
	if matchgroups["source"] == "" && matchgroups["destination"] != "" {
		if volumeNameRegexp.MatchString(matchgroups["destination"]) {
			if reservedNameRegexp.MatchString(matchgroups["destination"]) {
				return nil, fmt.Errorf("volume name %q cannot be a reserved word for Windows filenames", matchgroups["destination"])
			}
		} else {
			exists, isDir, _ := p.fi.fileInfo(matchgroups["destination"])
			if exists && !isDir {
				return nil, fmt.Errorf("file '%s' cannot be mapped. Only directories can be mapped on this platform", matchgroups["destination"])
			}
		}
	}
	return split, nil
}
// windowsValidMountMode reports whether mode is an acceptable mount mode on
// Windows: either empty, or (case-insensitively) one of the entries in
// rwModes.
//
// TODO should windows mounts produce an error if any mode was provided (they're a no-op on windows)
func windowsValidMountMode(mode string) bool {
	return mode == "" || rwModes[strings.ToLower(mode)]
}
// windowsValidateNotRoot rejects destinations that would shadow the root of
// the C: drive. Forward slashes are normalized to backslashes and the path
// is lowercased before comparison.
func windowsValidateNotRoot(p string) error {
	normalized := strings.ToLower(strings.ReplaceAll(p, `/`, `\`))
	switch normalized {
	case "c:", `c:\`:
		return fmt.Errorf(`destination path (%v) cannot be 'c:' or 'c:\'`, normalized)
	}
	return nil
}
// windowsValidators is the standard extra validation applied to every mount
// on Windows: the target must not be the C: drive root and must match the
// destination-path pattern.
var windowsValidators mountValidator = func(m *mount.Mount) error {
	if err := windowsValidateNotRoot(m.Target); err != nil {
		return err
	}
	if mountDestinationRegexp.MatchString(strings.ToLower(m.Target)) {
		return nil
	}
	return fmt.Errorf("invalid mount path: '%s'", m.Target)
}
// windowsValidateAbsolute returns an error unless p (lowercased) matches the
// destination-path pattern, i.e. unless it is an absolute Windows mount path.
func windowsValidateAbsolute(p string) error {
	if mountDestinationRegexp.MatchString(strings.ToLower(p)) {
		return nil
	}
	return fmt.Errorf("invalid mount path: '%s' mount path must be absolute", p)
}
// windowsDetectMountType infers the mount type from the shape of the source
// path: `\\.\pipe\...` is a named pipe, a host-directory-looking path is a
// bind mount, and anything else is treated as a volume name.
func windowsDetectMountType(p string) mount.Type {
	if strings.HasPrefix(p, `\\.\pipe\`) {
		return mount.TypeNamedPipe
	}
	if hostDirRegexp.MatchString(p) {
		return mount.TypeBind
	}
	return mount.TypeVolume
}
// ReadWrite reports whether the given mode string requests a writable mount;
// only a (case-insensitive) "ro" yields read-only.
func (p *windowsParser) ReadWrite(mode string) bool {
	lowered := strings.ToLower(mode)
	return lowered != "ro"
}
// ValidateVolumeName checks a volume name in a platform specific manner:
// it must match the volume-name pattern and must not be a reserved Windows
// filename (such as "con" or "nul").
func (p *windowsParser) ValidateVolumeName(name string) error {
	switch {
	case !volumeNameRegexp.MatchString(name):
		return errors.New("invalid volume name")
	case reservedNameRegexp.MatchString(name):
		return fmt.Errorf("volume name %q cannot be a reserved word for Windows filenames", name)
	}
	return nil
}
// ValidateMountConfig validates mnt using the standard Windows validators
// (non-root, well-formed target path) plus per-type checks.
func (p *windowsParser) ValidateMountConfig(mnt *mount.Mount) error {
	return p.validateMountConfigReg(mnt, windowsValidators)
}
// fileInfoProvider abstracts filesystem stat calls so they can be stubbed
// in tests.
type fileInfoProvider interface {
	// fileInfo reports whether path exists and whether it is a directory.
	fileInfo(path string) (exist, isDir bool, err error)
}
// defaultFileInfoProvider is the production fileInfoProvider, backed by
// os.Stat.
type defaultFileInfoProvider struct{}

// fileInfo reports whether path exists and whether it is a directory.
// A missing path is not an error: it returns (false, false, nil).
func (defaultFileInfoProvider) fileInfo(path string) (exist, isDir bool, _ error) {
	fi, err := os.Stat(path)
	if err != nil {
		// errors.Is unwraps wrapped errors, unlike the legacy os.IsNotExist.
		if !errors.Is(err, os.ErrNotExist) {
			return false, false, err
		}
		return false, false, nil
	}
	return true, fi.IsDir(), nil
}
// validateMountConfigReg validates mnt, first applying any
// additionalValidators, then performing per-mount-type (bind, volume,
// named pipe) checks. All failures are wrapped in *errMountConfig.
func (p *windowsParser) validateMountConfigReg(mnt *mount.Mount, additionalValidators ...mountValidator) error {
	if mnt.Target == "" {
		return &errMountConfig{mnt, errMissingField("Target")}
	}
	for _, v := range additionalValidators {
		if err := v(mnt); err != nil {
			return &errMountConfig{mnt, err}
		}
	}
	switch mnt.Type {
	case mount.TypeBind:
		if mnt.Source == "" {
			return &errMountConfig{mnt, errMissingField("Source")}
		}
		// Don't error out just because the propagation mode is not supported on the platform
		// NOTE(review): despite the comment above, a non-empty propagation
		// mode IS rejected here -- confirm which behavior is intended.
		if opts := mnt.BindOptions; opts != nil {
			if len(opts.Propagation) > 0 {
				return &errMountConfig{mnt, fmt.Errorf("invalid propagation mode: %s", opts.Propagation)}
			}
		}
		if mnt.VolumeOptions != nil {
			return &errMountConfig{mnt, errExtraField("VolumeOptions")}
		}
		if err := windowsValidateAbsolute(mnt.Source); err != nil {
			return &errMountConfig{mnt, err}
		}
		// The bind source must exist and be a directory.
		exists, isdir, err := p.fi.fileInfo(mnt.Source)
		if err != nil {
			return &errMountConfig{mnt, err}
		}
		if !exists {
			return &errMountConfig{mnt, errBindSourceDoesNotExist(mnt.Source)}
		}
		if !isdir {
			return &errMountConfig{mnt, errors.New("source path must be a directory")}
		}
	case mount.TypeVolume:
		if mnt.BindOptions != nil {
			return &errMountConfig{mnt, errExtraField("BindOptions")}
		}
		// An empty source means an anonymous (engine-named) volume.
		anonymousVolume := mnt.Source == ""
		if mnt.VolumeOptions != nil && mnt.VolumeOptions.Subpath != "" {
			if anonymousVolume {
				return errAnonymousVolumeWithSubpath
			}
			// Check if path is relative but without any back traversals
			if !filepath.IsLocal(mnt.VolumeOptions.Subpath) {
				return &errMountConfig{mnt, errInvalidSubpath}
			}
		}
		if anonymousVolume && mnt.ReadOnly {
			return &errMountConfig{mnt, errors.New("must not set ReadOnly mode when using anonymous volumes")}
		}
		if mnt.Source != "" {
			if err := p.ValidateVolumeName(mnt.Source); err != nil {
				return &errMountConfig{mnt, err}
			}
		}
	case mount.TypeNamedPipe:
		if mnt.Source == "" {
			return &errMountConfig{mnt, errMissingField("Source")}
		}
		if mnt.BindOptions != nil {
			return &errMountConfig{mnt, errExtraField("BindOptions")}
		}
		if mnt.ReadOnly {
			return &errMountConfig{mnt, errExtraField("ReadOnly")}
		}
		// Both ends of a pipe mount must look like `\\.\pipe\...` paths.
		if windowsDetectMountType(mnt.Source) != mount.TypeNamedPipe {
			return &errMountConfig{mnt, fmt.Errorf("'%s' is not a valid pipe path", mnt.Source)}
		}
		if windowsDetectMountType(mnt.Target) != mount.TypeNamedPipe {
			return &errMountConfig{mnt, fmt.Errorf("'%s' is not a valid pipe path", mnt.Target)}
		}
	default:
		return &errMountConfig{mnt, errors.New("mount type unknown")}
	}
	return nil
}
// ParseMountRaw splits the raw volume spec with the Windows spec regexp and
// converts it into a validated MountPoint.
func (p *windowsParser) ParseMountRaw(raw, volumeDriver string) (*MountPoint, error) {
	arr, err := p.splitRawSpec(raw, windowsSplitRawSpecRegexp)
	if err != nil {
		return nil, err
	}
	return p.parseMount(arr, raw, volumeDriver, true, windowsValidators)
}
// parseMount converts a pre-split raw spec (arr) into a MountPoint. raw is
// the original spec string (used only for error reporting), volumeDriver is
// applied to volume-type mounts only, and convertTargetToBackslash
// normalizes the target to Windows path separators.
func (p *windowsParser) parseMount(arr []string, raw, volumeDriver string, convertTargetToBackslash bool, additionalValidators ...mountValidator) (*MountPoint, error) {
	var spec mount.Mount
	var mode string
	switch len(arr) {
	case 1:
		// Just a destination path in the container
		spec.Target = arr[0]
	case 2:
		if windowsValidMountMode(arr[1]) {
			// Destination + Mode is not a valid volume - volumes
			// cannot include a mode. e.g. /foo:rw
			return nil, errInvalidSpec(raw)
		}
		// Host Source Path or Name + Destination
		spec.Source = strings.ReplaceAll(arr[0], `/`, `\`)
		spec.Target = arr[1]
	case 3:
		// HostSourcePath+DestinationPath+Mode
		spec.Source = strings.ReplaceAll(arr[0], `/`, `\`)
		spec.Target = arr[1]
		mode = arr[2]
	default:
		return nil, errInvalidSpec(raw)
	}
	if convertTargetToBackslash {
		spec.Target = strings.ReplaceAll(spec.Target, `/`, `\`)
	}
	if !windowsValidMountMode(mode) {
		return nil, errInvalidMode(mode)
	}
	// The mount type is inferred from the shape of the source path.
	spec.Type = windowsDetectMountType(spec.Source)
	spec.ReadOnly = !p.ReadWrite(mode)
	// cannot assume that if a volume driver is passed in that we should set it
	if volumeDriver != "" && spec.Type == mount.TypeVolume {
		spec.VolumeOptions = &mount.VolumeOptions{
			DriverConfig: &mount.Driver{Name: volumeDriver},
		}
	}
	// Honor an explicit copy mode embedded in the mode string, if present.
	if copyData, isSet := getCopyMode(mode, p.DefaultCopyMode()); isSet {
		if spec.VolumeOptions == nil {
			spec.VolumeOptions = &mount.VolumeOptions{}
		}
		spec.VolumeOptions.NoCopy = !copyData
	}
	mp, err := p.parseMountSpec(spec, convertTargetToBackslash, additionalValidators...)
	if mp != nil {
		mp.Mode = mode
	}
	if err != nil {
		// Prefix errors with the offending raw spec for context.
		err = fmt.Errorf("%v: %v", errInvalidSpec(raw), err)
	}
	return mp, err
}
// ParseMountSpec converts a mount.Mount into a validated MountPoint,
// normalizing the target to backslashes.
func (p *windowsParser) ParseMountSpec(cfg mount.Mount) (*MountPoint, error) {
	return p.parseMountSpec(cfg, true, windowsValidators)
}
// parseMountSpec validates cfg and converts it into a MountPoint, filling in
// per-type fields (volume name/driver/copy-mode, or normalized source path)
// and trimming trailing backslashes from source and destination.
func (p *windowsParser) parseMountSpec(cfg mount.Mount, convertTargetToBackslash bool, additionalValidators ...mountValidator) (*MountPoint, error) {
	if err := p.validateMountConfigReg(&cfg, additionalValidators...); err != nil {
		return nil, err
	}
	mp := &MountPoint{
		RW:          !cfg.ReadOnly,
		Destination: cfg.Target,
		Type:        cfg.Type,
		Spec:        cfg,
	}
	if convertTargetToBackslash {
		mp.Destination = strings.ReplaceAll(cfg.Target, `/`, `\`)
	}
	switch cfg.Type {
	case mount.TypeVolume:
		if cfg.Source != "" {
			// non-anonymous volume
			mp.Name = cfg.Source
		}
		mp.CopyData = p.DefaultCopyMode()
		if cfg.VolumeOptions != nil {
			if cfg.VolumeOptions.DriverConfig != nil {
				mp.Driver = cfg.VolumeOptions.DriverConfig.Name
			}
			if cfg.VolumeOptions.NoCopy {
				mp.CopyData = false
			}
		}
	case mount.TypeBind:
		mp.Source = strings.ReplaceAll(cfg.Source, `/`, `\`)
	case mount.TypeNamedPipe:
		mp.Source = strings.ReplaceAll(cfg.Source, `/`, `\`)
	default:
		// TODO(thaJeztah): make switch exhaustive: anything to do for mount.TypeTmpfs, mount.TypeCluster, mount.TypeImage ?
	}
	// cleanup trailing `\` except for paths like `c:\`
	if len(mp.Source) > 3 && mp.Source[len(mp.Source)-1] == '\\' {
		mp.Source = mp.Source[:len(mp.Source)-1]
	}
	if len(mp.Destination) > 3 && mp.Destination[len(mp.Destination)-1] == '\\' {
		mp.Destination = mp.Destination[:len(mp.Destination)-1]
	}
	return mp, nil
}
// ParseVolumesFrom splits a volumes-from specification ("id" or "id:mode")
// into container ID and mount mode, defaulting the mode to "rw". Copy modes
// are rejected because they are meaningless for volumes-from.
func (p *windowsParser) ParseVolumesFrom(spec string) (string, string, error) {
	if spec == "" {
		return "", "", errors.New("volumes-from specification cannot be an empty string")
	}
	id, mode, _ := strings.Cut(spec, ":")
	switch {
	case mode == "":
		return id, "rw", nil
	case !windowsValidMountMode(mode):
		return "", "", errInvalidMode(mode)
	}
	// Do not allow copy modes on volumes-from.
	if _, isSet := getCopyMode(mode, p.DefaultCopyMode()); isSet {
		return "", "", errInvalidMode(mode)
	}
	return id, mode, nil
}
// DefaultPropagationMode returns the default mount propagation; Windows has
// no propagation concept, so this is always empty.
func (p *windowsParser) DefaultPropagationMode() mount.Propagation {
	return ""
}

// ConvertTmpfsOptions always fails: Windows has no tmpfs mounts.
func (p *windowsParser) ConvertTmpfsOptions(opt *mount.TmpfsOptions, readOnly bool) (string, error) {
	return "", errors.New("windows does not support tmpfs")
}

// DefaultCopyMode reports the default data-copy behavior for volumes;
// always false on Windows.
func (p *windowsParser) DefaultCopyMode() bool {
	return false
}

// IsBackwardCompatible reports whether m can use a legacy representation;
// never the case on Windows.
func (p *windowsParser) IsBackwardCompatible(m *MountPoint) bool {
	return false
}

// ValidateTmpfsMountDestination always fails: Windows has no tmpfs mounts.
func (p *windowsParser) ValidateTmpfsMountDestination(dest string) error {
	return errors.New("windows does not support tmpfs")
}

// HasResource reports whether absolutePath belongs to the mount point;
// always false on Windows.
func (p *windowsParser) HasResource(m *MountPoint, absolutePath string) bool {
	return false
}
package safepath
import (
"os"
"path/filepath"
"github.com/pkg/errors"
)
// evaluatePath evaluates symlinks in the concatenation of path and subpath.
// On success, resolvedBasePath is path with all symlinks resolved, and
// resolvedSubpath is a relative path, rooted at resolvedBasePath, pointing
// to the fully-resolved concatenation. A missing component yields
// *ErrNotAccessible; a result outside the base yields *ErrEscapesBase.
func evaluatePath(path, subpath string) (resolvedBasePath string, resolvedSubpath string, err error) {
	base, err := filepath.EvalSymlinks(path)
	if err != nil {
		if errors.Is(err, os.ErrNotExist) {
			return "", "", &ErrNotAccessible{Path: path, Cause: err}
		}
		return "", "", errors.Wrapf(err, "error while resolving symlinks in base directory %q", path)
	}
	joined := filepath.Join(base, subpath)
	resolved, err := filepath.EvalSymlinks(joined)
	if err != nil {
		if errors.Is(err, os.ErrNotExist) {
			return "", "", &ErrNotAccessible{Path: joined, Cause: err}
		}
		return "", "", errors.Wrapf(err, "error while resolving symlinks in combined path %q", joined)
	}
	// The resolved target must still be a local (non-escaping) path under
	// the resolved base.
	rel, err := filepath.Rel(base, resolved)
	if err != nil || !filepath.IsLocal(rel) {
		return "", "", &ErrEscapesBase{Base: base, Subpath: subpath}
	}
	return base, rel, nil
}
// isLocalTo reports whether path, using lexical analysis only, has all of these properties:
//   - is within the subtree rooted at basepath
//   - is not empty
//   - on Windows, is not a reserved name such as "NUL"
//
// If isLocalTo(path, basepath) returns true, then
//
//	filepath.Rel(basepath, path)
//
// will always produce an unrooted path with no `..` elements.
//
// isLocalTo is a purely lexical operation. In particular, it does not account
// for the effect of any symbolic links that may exist in the filesystem.
//
// Both path and basepath are expected to be absolute paths.
func isLocalTo(path, basepath string) bool {
	rel, err := filepath.Rel(basepath, path)
	return err == nil && filepath.IsLocal(rel)
}
package safepath
// ErrNotAccessible is returned by Join when the resulting path doesn't exist,
// is not accessible, or any of the path components was replaced with a symlink
// during the path traversal.
type ErrNotAccessible struct {
	Path  string
	Cause error
}

// NotFound is a marker method (errdefs-style) classifying this as a
// not-found error.
func (*ErrNotAccessible) NotFound() {}

// Unwrap returns the underlying cause, which may be nil.
func (e *ErrNotAccessible) Unwrap() error {
	return e.Cause
}

// Error describes the inaccessible path, including the cause when present.
func (e *ErrNotAccessible) Error() string {
	if e.Cause == nil {
		return "cannot access path " + e.Path
	}
	return "cannot access path " + e.Path + ": " + e.Cause.Error()
}
// ErrEscapesBase is returned by Join when the resulting concatenation would
// point outside of the specified base directory.
type ErrEscapesBase struct {
	Base, Subpath string
}

// InvalidParameter is a marker method (errdefs-style) classifying this as a
// bad-input error.
func (*ErrEscapesBase) InvalidParameter() {}

// Error describes the escape, appending the base and subpath when set.
func (e *ErrEscapesBase) Error() string {
	out := "path concatenation escapes the base directory"
	if b := e.Base; b != "" {
		out += ", base: " + b
	}
	if s := e.Subpath; s != "" {
		out += ", subpath: " + s
	}
	return out
}
package safepath
import (
"context"
"os"
"path/filepath"
"runtime"
"strconv"
"github.com/containerd/log"
"github.com/docker/docker/internal/unix_noeintr"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
// Join makes sure that the concatenation of path and subpath doesn't
// resolve to a path outside of path and returns a path to a temporary file that is
// a bind mount to the exact same file/directory that was validated.
//
// After use, it is the caller's responsibility to call Close on the returned
// SafePath object, which will unmount the temporary file/directory
// and remove it.
func Join(ctx context.Context, path, subpath string) (*SafePath, error) {
	// Resolve symlinks lexically first; safeOpenFd below re-checks the
	// resolved components against TOCTOU swaps.
	base, subpart, err := evaluatePath(path, subpath)
	if err != nil {
		return nil, err
	}
	// Pin the goroutine to its OS thread while we hold raw fds and perform
	// the mount.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()
	fd, err := safeOpenFd(base, subpart)
	if err != nil {
		return nil, err
	}
	defer func() {
		if err := unix_noeintr.Close(fd); err != nil {
			log.G(ctx).WithError(err).Errorf("Closing FD %d failed for safeOpenFd(%s, %s)", fd, base, subpart)
		}
	}()
	tmpMount, err := tempMountPoint(fd)
	if err != nil {
		return nil, errors.Wrap(err, "failed to create temporary file for safe mount")
	}
	// Bind-mount through /proc/self/fd so the mount targets exactly the
	// validated inode, not a path that could have been swapped meanwhile.
	if err := unix_noeintr.Mount("/proc/self/fd/"+strconv.Itoa(fd), tmpMount, "none", unix.MS_BIND, ""); err != nil {
		if err := os.Remove(tmpMount); err != nil {
			log.G(ctx).WithError(err).Warn("failed to remove tmpMount after failed mount")
		}
		return nil, errors.Wrap(err, "failed to mount resolved path")
	}
	return &SafePath{
		path:          tmpMount,
		sourceBase:    base,
		sourceSubpath: subpart,
		cleanup:       cleanupSafePath(tmpMount),
	}, nil
}
// safeOpenFd opens the file at filepath.Join(path, subpath) in O_PATH
// mode and returns the file descriptor if subpath is within the subtree
// rooted at path. It is an error if any of components of path or subpath
// are symbolic links.
//
// It is a caller's responsibility to close the returned file descriptor, if no
// error was returned.
func safeOpenFd(path, subpath string) (int, error) {
	// Open base volume path (_data directory).
	prevFd, err := unix_noeintr.Open(path, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC|unix.O_NOFOLLOW, 0)
	if err != nil {
		return -1, &ErrNotAccessible{Path: path, Cause: err}
	}
	defer unix_noeintr.Close(prevFd)
	// Try to use the Openat2 syscall first (available on Linux 5.6+).
	// RESOLVE_BENEATH + RESOLVE_NO_SYMLINKS makes the kernel enforce the
	// same containment guarantees as the manual loop fallback.
	fd, err := unix_noeintr.Openat2(prevFd, subpath, &unix.OpenHow{
		Flags:   unix.O_PATH | unix.O_CLOEXEC,
		Mode:    0,
		Resolve: unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS | unix.RESOLVE_NO_SYMLINKS,
	})
	switch {
	case errors.Is(err, unix.ENOSYS):
		// Openat2 is not available, fallback to Openat loop.
		return kubernetesSafeOpen(path, subpath)
	case errors.Is(err, unix.EXDEV):
		// EXDEV from openat2 means the resolution escaped the base.
		return -1, &ErrEscapesBase{Base: path, Subpath: subpath}
	case errors.Is(err, unix.ENOENT), errors.Is(err, unix.ELOOP):
		return -1, &ErrNotAccessible{Path: filepath.Join(path, subpath), Cause: err}
	case err != nil:
		return -1, &os.PathError{Op: "openat2", Path: subpath, Err: err}
	}
	// Openat2 is available and succeeded.
	return fd, nil
}
// tempMountPoint creates a temporary file/directory to act as mount
// point for the file descriptor. A directory source gets a temp directory;
// any other file type gets a temp file (created then closed, so only the
// name is used).
func tempMountPoint(sourceFd int) (string, error) {
	var stat unix.Stat_t
	err := unix_noeintr.Fstat(sourceFd, &stat)
	if err != nil {
		return "", errors.Wrap(err, "failed to Fstat mount source fd")
	}
	isDir := (stat.Mode & unix.S_IFMT) == unix.S_IFDIR
	if isDir {
		return os.MkdirTemp("", "safe-mount")
	}
	f, err := os.CreateTemp("", "safe-mount")
	if err != nil {
		return "", err
	}
	p := f.Name()
	if err := f.Close(); err != nil {
		return "", err
	}
	return p, nil
}
// cleanupSafePath returns a function that unmounts the path and removes the
// mountpoint. An already-gone mount or mountpoint (EINVAL from umount,
// ErrNotExist from remove) is treated as success.
func cleanupSafePath(path string) func(context.Context) error {
	return func(ctx context.Context) error {
		log.G(ctx).WithField("path", path).Debug("removing safe temp mount")
		if err := unix_noeintr.Unmount(path, unix.MNT_DETACH); err != nil {
			if errors.Is(err, unix.EINVAL) {
				// Not a mount point (anymore); nothing to unmount.
				log.G(ctx).WithField("path", path).Warn("safe temp mount no longer exists?")
				return nil
			}
			return errors.Wrapf(err, "error unmounting safe mount %s", path)
		}
		if err := os.Remove(path); err != nil {
			if errors.Is(err, os.ErrNotExist) {
				log.G(ctx).WithField("path", path).Warn("safe temp mount no longer exists?")
				return nil
			}
			// Wrap, not Wrapf: the message contains no format verbs.
			return errors.Wrap(err, "failed to delete temporary safe mount")
		}
		return nil
	}
}
package safepath
/*
Copyright 2014 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import (
"context"
"fmt"
"path/filepath"
"strings"
"github.com/containerd/log"
"github.com/docker/docker/internal/unix_noeintr"
"golang.org/x/sys/unix"
)
// kubernetesSafeOpen open path formed by concatenation of the base directory
// and its subpath and return its fd.
// Symlinks are disallowed (pathname must already resolve symlinks) and the
// path must be within the base directory.
// This is minimally modified code from https://github.com/kubernetes/kubernetes/blob/55fb1805a1217b91b36fa8fe8f2bf3a28af2454d/pkg/volume/util/subpath/subpath_linux.go#L530
func kubernetesSafeOpen(base, subpath string) (int, error) {
	// syscall.Openat flags used to traverse directories not following symlinks
	const nofollowFlags = unix.O_RDONLY | unix.O_NOFOLLOW
	// flags for getting file descriptor without following the symlink
	const openFDFlags = unix.O_NOFOLLOW | unix.O_PATH
	pathname := filepath.Join(base, subpath)
	segments := strings.Split(subpath, string(filepath.Separator))
	// Assumption: base is the only directory that we have under control.
	// Base dir is not allowed to be a symlink.
	parentFD, err := unix_noeintr.Open(base, nofollowFlags|unix.O_CLOEXEC, 0)
	if err != nil {
		return -1, &ErrNotAccessible{Path: base, Cause: err}
	}
	// parentFD/childFD are set to -1 once ownership is transferred, so these
	// deferred closes only fire for fds still owned at return time.
	defer func() {
		if parentFD != -1 {
			if err = unix_noeintr.Close(parentFD); err != nil {
				log.G(context.TODO()).Errorf("Closing FD %v failed for safeopen(%v): %v", parentFD, pathname, err)
			}
		}
	}()
	childFD := -1
	defer func() {
		if childFD != -1 {
			if err = unix_noeintr.Close(childFD); err != nil {
				log.G(context.TODO()).Errorf("Closing FD %v failed for safeopen(%v): %v", childFD, pathname, err)
			}
		}
	}()
	currentPath := base
	// Follow the segments one by one using openat() to make
	// sure the user cannot change already existing directories into symlinks.
	for _, seg := range segments {
		var deviceStat unix.Stat_t
		currentPath = filepath.Join(currentPath, seg)
		if !isLocalTo(currentPath, base) {
			// NOTE(review): Base here is currentPath, not base — looks like
			// the error fields carry the offending path; confirm against
			// upstream intent.
			return -1, &ErrEscapesBase{Base: currentPath, Subpath: seg}
		}
		// Trigger auto mount if it's an auto-mounted directory, ignore error if not a directory.
		// Notice the trailing slash is mandatory, see "automount" in openat(2) and open_by_handle_at(2).
		_ = unix_noeintr.Fstatat(parentFD, seg+"/", &deviceStat, unix.AT_SYMLINK_NOFOLLOW)
		log.G(context.TODO()).Debugf("Opening path %s", currentPath)
		childFD, err = unix_noeintr.Openat(parentFD, seg, openFDFlags|unix.O_CLOEXEC, 0)
		if err != nil {
			return -1, &ErrNotAccessible{Path: currentPath, Cause: err}
		}
		err := unix_noeintr.Fstat(childFD, &deviceStat)
		if err != nil {
			return -1, fmt.Errorf("error running fstat on %s with %v", currentPath, err)
		}
		fileFmt := deviceStat.Mode & unix.S_IFMT
		if fileFmt == unix.S_IFLNK {
			return -1, fmt.Errorf("unexpected symlink found %s", currentPath)
		}
		// Close parentFD
		if err = unix_noeintr.Close(parentFD); err != nil {
			return -1, fmt.Errorf("closing fd for %q failed: %v", filepath.Dir(currentPath), err)
		}
		// Set child to new parent
		parentFD = childFD
		childFD = -1
	}
	// We made it to the end, return this fd, don't close it
	finalFD := parentFD
	parentFD = -1
	return finalFD, nil
}
package safepath
import (
"context"
"fmt"
"sync"
"github.com/containerd/log"
)
// SafePath is a validated, bind-mounted view of a path. path and cleanup are
// guarded by mutex and zeroed by Close.
type SafePath struct {
	// path is the temporary mount point; empty once closed.
	path string
	// cleanup unmounts and removes path.
	cleanup func(ctx context.Context) error
	mutex   sync.Mutex
	// Immutable fields
	sourceBase, sourceSubpath string
}
// Close releases the resources used by the path: it marks the SafePath as
// closed and runs the cleanup function (unmount + remove). Closing an
// already-closed SafePath logs a warning and returns nil.
func (s *SafePath) Close(ctx context.Context) error {
	s.mutex.Lock()
	defer s.mutex.Unlock()
	if s.path == "" {
		base, sub := s.SourcePath()
		// Log s.path directly: calling s.Path() here would self-deadlock
		// (it re-locks the non-reentrant mutex we hold) and would panic on
		// a closed path anyway.
		log.G(ctx).WithFields(log.Fields{
			"path":          s.path,
			"sourceBase":    base,
			"sourceSubpath": sub,
		}).Warn("an attempt to close an already closed SafePath")
		return nil
	}
	s.path = ""
	if s.cleanup != nil {
		return s.cleanup(ctx)
	}
	return nil
}
// IsValid return true when path can still be used and wasn't cleaned up by Close.
func (s *SafePath) IsValid() bool {
	s.mutex.Lock()
	defer s.mutex.Unlock()
	// An empty path marks a closed SafePath.
	return s.path != ""
}
// Path returns a safe, temporary path that can be used to access the original path.
// It panics if the SafePath was already closed (use-after-close is a
// programmer error).
func (s *SafePath) Path() string {
	s.mutex.Lock()
	defer s.mutex.Unlock()
	if s.path == "" {
		panic(fmt.Sprintf("use-after-close attempted for safepath with source [%s, %s]", s.sourceBase, s.sourceSubpath))
	}
	return s.path
}
// SourcePath returns the source path the safepath points to, as the
// (base, subpath) pair the SafePath was created from.
func (s *SafePath) SourcePath() (string, string) {
	// No mutex lock because these are immutable.
	return s.sourceBase, s.sourceSubpath
}
package distribution
import (
"context"
"encoding/json"
"io"
"runtime"
"github.com/distribution/reference"
"github.com/docker/distribution"
"github.com/docker/distribution/manifest/schema2"
"github.com/docker/docker/distribution/metadata"
"github.com/docker/docker/distribution/xfer"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
"github.com/docker/docker/pkg/progress"
refstore "github.com/docker/docker/reference"
registrypkg "github.com/docker/docker/registry"
"github.com/moby/moby/api/types/events"
"github.com/moby/moby/api/types/registry"
"github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// Config stores configuration for communicating
// with a registry. It is embedded by both ImagePullConfig and
// ImagePushConfig.
type Config struct {
	// MetaHeaders stores HTTP headers with metadata about the image
	MetaHeaders map[string][]string
	// AuthConfig holds authentication credentials for authenticating with
	// the registry.
	AuthConfig *registry.AuthConfig
	// ProgressOutput is the interface for showing the status of the pull
	// operation.
	ProgressOutput progress.Output
	// RegistryService is the registry service to use for TLS configuration
	// and endpoint lookup.
	RegistryService RegistryResolver
	// ImageEventLogger notifies events for a given image
	ImageEventLogger func(ctx context.Context, id, name string, action events.Action)
	// MetadataStore is the storage backend for distribution-specific
	// metadata.
	MetadataStore metadata.Store
	// ImageStore manages images.
	ImageStore ImageConfigStore
	// ReferenceStore manages tags. This value is optional, when excluded
	// content will not be tagged.
	ReferenceStore refstore.Store
}
// ImagePullConfig stores pull configuration.
type ImagePullConfig struct {
	Config
	// DownloadManager manages concurrent pulls.
	DownloadManager *xfer.LayerDownloadManager
	// Schema2Types is an optional list of valid schema2 configuration types
	// allowed by the pull operation. If omitted, the default list of accepted
	// types is used.
	Schema2Types []string
	// Platform is the requested platform of the image being pulled
	Platform *ocispec.Platform
}
// ImagePushConfig stores push configuration.
type ImagePushConfig struct {
	Config
	// ConfigMediaType is the configuration media type for
	// schema2 manifests.
	ConfigMediaType string
	// LayerStores manages layers.
	LayerStores PushLayerProvider
	// UploadManager dispatches uploads.
	UploadManager *xfer.LayerUploadManager
}
// RegistryResolver is used for TLS configuration and endpoint lookup.
type RegistryResolver interface {
	// ResolveAuthConfig picks the auth credentials matching the given reference.
	ResolveAuthConfig(map[string]registry.AuthConfig, reference.Named) registry.AuthConfig
	LookupPushEndpoints(hostname string) (endpoints []registrypkg.APIEndpoint, err error)
	LookupPullEndpoints(hostname string) (endpoints []registrypkg.APIEndpoint, err error)
}
// ImageConfigStore handles storing and getting image configurations
// by digest. Allows getting an image configurations rootfs from the
// configuration.
type ImageConfigStore interface {
	// Put stores the raw image config bytes and returns their digest.
	Put(context.Context, []byte) (digest.Digest, error)
	// Get returns the raw image config bytes for the given digest.
	Get(context.Context, digest.Digest) ([]byte, error)
}
// PushLayerProvider provides layers to be pushed by ChainID.
type PushLayerProvider interface {
	Get(layer.ChainID) (PushLayer, error)
}
// PushLayer is a pushable layer with metadata about the layer
// and access to the content of the layer.
type PushLayer interface {
	ChainID() layer.ChainID
	DiffID() layer.DiffID
	// Parent returns the parent layer, or nil for a base layer.
	Parent() PushLayer
	// Open returns a stream of the layer content; caller closes it.
	Open() (io.ReadCloser, error)
	Size() int64
	MediaType() string
	// Release returns the layer to its backing store.
	Release()
}
// imageConfigStore adapts an image.Store to the ImageConfigStore interface.
type imageConfigStore struct {
	image.Store
}
// NewImageConfigStoreFromStore returns an ImageConfigStore backed
// by an image.Store for container images.
func NewImageConfigStoreFromStore(is image.Store) ImageConfigStore {
	return &imageConfigStore{
		Store: is,
	}
}
// Put stores the raw config bytes as an image and returns the resulting
// image ID as a digest.
func (s *imageConfigStore) Put(_ context.Context, c []byte) (digest.Digest, error) {
	id, err := s.Store.Create(c)
	return digest.Digest(id), err
}

// Get returns the raw JSON config of the image identified by digest d.
func (s *imageConfigStore) Get(_ context.Context, d digest.Digest) ([]byte, error) {
	img, err := s.Store.Get(image.ID(d))
	if err != nil {
		return nil, err
	}
	return img.RawJSON(), nil
}
// rootFSFromConfig unmarshals raw image config bytes and returns the
// embedded RootFS section.
func rootFSFromConfig(c []byte) (*image.RootFS, error) {
	var img image.Image
	if err := json.Unmarshal(c, &img); err != nil {
		return nil, err
	}
	return img.RootFS, nil
}
// platformFromConfig extracts an OCI platform descriptor from raw image
// config bytes. An empty OS falls back to the host's runtime.GOOS, and the
// OS must be usable on this platform.
func platformFromConfig(c []byte) (*ocispec.Platform, error) {
	var img image.Image
	if err := json.Unmarshal(c, &img); err != nil {
		return nil, err
	}
	imgOS := img.OS
	if imgOS == "" {
		imgOS = runtime.GOOS
	}
	if err := image.CheckOS(imgOS); err != nil {
		return nil, errors.Wrapf(err, "image operating system %q cannot be used on this platform", imgOS)
	}
	return &ocispec.Platform{
		OS:           imgOS,
		Architecture: img.Architecture,
		Variant:      img.Variant,
		OSVersion:    img.OSVersion,
	}, nil
}
// storeLayerProvider adapts a layer.Store to the PushLayerProvider interface.
type storeLayerProvider struct {
	ls layer.Store
}

// NewLayerProvidersFromStore returns layer providers backed by
// an instance of LayerStore. Only getting layers as gzipped
// tars is supported.
func NewLayerProvidersFromStore(ls layer.Store) PushLayerProvider {
	return &storeLayerProvider{ls: ls}
}
// Get returns a PushLayer for the given chain ID. An empty ID yields the
// canonical empty layer; a layer that is distribution.Describable is wrapped
// so its descriptor is exposed.
func (p *storeLayerProvider) Get(lid layer.ChainID) (PushLayer, error) {
	if lid == "" {
		return &storeLayer{Layer: layer.EmptyLayer}, nil
	}
	l, err := p.ls.Get(lid)
	if err != nil {
		return nil, err
	}
	sl := storeLayer{Layer: l, ls: p.ls}
	d, ok := l.(distribution.Describable)
	if !ok {
		return &sl, nil
	}
	return &describableStoreLayer{storeLayer: sl, describable: d}, nil
}
// storeLayer adapts a layer.Layer to the PushLayer interface; ls is kept so
// Release can return the layer to its store (nil for the empty layer).
type storeLayer struct {
	layer.Layer
	ls layer.Store
}
// Parent returns the parent layer as a PushLayer, or nil for a base layer.
// Describable parents are wrapped to expose their descriptor.
func (l *storeLayer) Parent() PushLayer {
	parent := l.Layer.Parent()
	if parent == nil {
		return nil
	}
	sl := storeLayer{Layer: parent, ls: l.ls}
	if d, ok := parent.(distribution.Describable); ok {
		return &describableStoreLayer{storeLayer: sl, describable: d}
	}
	return &sl
}
// Open returns an uncompressed tar stream of the layer content.
func (l *storeLayer) Open() (io.ReadCloser, error) {
	return l.Layer.TarStream()
}

// Size returns the size of the layer's diff.
func (l *storeLayer) Size() int64 {
	return l.Layer.DiffSize()
}

// MediaType returns the media type of the layer content.
func (l *storeLayer) MediaType() string {
	// layer store always returns uncompressed tars
	return schema2.MediaTypeUncompressedLayer
}

// Release returns the layer to its store; a nil store (empty layer) is a
// no-op.
func (l *storeLayer) Release() {
	if l.ls != nil {
		layer.ReleaseAndLog(l.ls, l.Layer)
	}
}
// describableStoreLayer is a storeLayer whose underlying layer also exposes
// a distribution descriptor.
type describableStoreLayer struct {
	storeLayer
	describable distribution.Describable
}

// Descriptor returns the underlying layer's distribution descriptor.
func (l *describableStoreLayer) Descriptor() distribution.Descriptor {
	return l.describable.Descriptor()
}
package distribution
import (
"context"
"fmt"
"net/url"
"strings"
"syscall"
"github.com/containerd/log"
"github.com/distribution/reference"
"github.com/docker/distribution"
"github.com/docker/distribution/registry/api/errcode"
v2 "github.com/docker/distribution/registry/api/v2"
"github.com/docker/distribution/registry/client"
"github.com/docker/distribution/registry/client/auth"
"github.com/docker/docker/distribution/xfer"
"github.com/docker/docker/errdefs"
"github.com/pkg/errors"
)
// fallbackError wraps an error that can possibly allow fallback to a different
// endpoint.
type fallbackError struct {
	// err is the error being wrapped.
	err error
	// transportOK is set to true if we managed to speak HTTP with the
	// registry. This confirms that we're using appropriate TLS settings
	// (or lack of TLS).
	transportOK bool
}

// Error renders the FallbackError as a string.
func (f fallbackError) Error() string {
	return f.err.Error()
}

// Cause returns the wrapped error (pkg/errors-style).
func (f fallbackError) Cause() error {
	return f.err
}

// Unwrap returns the wrapped error (errors.Is/As-style).
func (f fallbackError) Unwrap() error {
	return f.err
}
// notFoundError wraps a registry errcode.Error for a reference that could
// not be found, producing a user-friendly message per error code.
type notFoundError struct {
	cause errcode.Error
	ref   reference.Named
}

// Error maps the registry error code to a human-readable message.
func (e notFoundError) Error() string {
	switch e.cause.Code {
	case errcode.ErrorCodeDenied:
		// ErrorCodeDenied is used when access to the repository was denied
		return errors.Wrapf(e.cause, "pull access denied for %s, repository does not exist or may require 'docker login'", reference.FamiliarName(e.ref)).Error()
	case v2.ErrorCodeManifestUnknown:
		return errors.Wrapf(e.cause, "manifest for %s not found", reference.FamiliarString(e.ref)).Error()
	case v2.ErrorCodeNameUnknown:
		return errors.Wrapf(e.cause, "repository %s not found", reference.FamiliarName(e.ref)).Error()
	}
	// Shouldn't get here, but this is better than returning an empty string
	return e.cause.Message
}

// NotFound is a marker method (errdefs-style) classifying this as a
// not-found error.
func (e notFoundError) NotFound() {}

// Cause returns the wrapped registry error (pkg/errors-style).
func (e notFoundError) Cause() error {
	return e.cause
}

// Unwrap returns the wrapped registry error (errors.Is/As-style).
func (e notFoundError) Unwrap() error {
	return e.cause
}
// unsupportedMediaTypeError is an error issued when attempted
// to pull unsupported content.
type unsupportedMediaTypeError struct {
	MediaType string
}

// InvalidParameter is a marker method (errdefs-style) classifying this as a
// bad-input error.
func (e unsupportedMediaTypeError) InvalidParameter() {}

// Error returns the error string for unsupportedMediaTypeError.
func (e unsupportedMediaTypeError) Error() string {
	return fmt.Sprintf("unsupported media type %s", e.MediaType)
}
// translatePullError is used to convert an error from a registry pull
// operation to an error representing the entire pull operation. Any error
// information which is not used by the returned error gets output to
// log at info level.
func translatePullError(err error, ref reference.Named) error {
	// FIXME(thaJeztah): cleanup error and context handling in this package, as it's really messy.
	if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
		return err
	}
	switch v := err.(type) {
	case errcode.Errors:
		// Only the first error is translated; the rest are logged.
		if len(v) != 0 {
			for _, extra := range v[1:] {
				log.G(context.TODO()).WithError(extra).Infof("Ignoring extra error returned from registry")
			}
			return translatePullError(v[0], ref)
		}
	case errcode.Error:
		switch v.Code {
		case errcode.ErrorCodeDenied, v2.ErrorCodeManifestUnknown, v2.ErrorCodeNameUnknown:
			return notFoundError{v, ref}
		}
	case xfer.DoNotRetry:
		// Unwrap the retry marker and translate the underlying error.
		return translatePullError(v.Err, ref)
	}
	// Anything unrecognized (including an empty errcode.Errors) is unknown.
	return errdefs.Unknown(err)
}
// continueOnError returns true if we should fallback to the next endpoint
// as a result of this error. mirrorEndpoint indicates that the failing
// endpoint was a mirror, in which case falling back to the canonical
// repository is generally allowed.
func continueOnError(err error, mirrorEndpoint bool) bool {
	// FIXME(thaJeztah): cleanup error and context handling in this package, as it's really messy.
	if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
		return false
	}
	switch v := err.(type) {
	case errcode.Errors:
		if len(v) == 0 {
			return true
		}
		// Decide based on the first error only.
		return continueOnError(v[0], mirrorEndpoint)
	case errcode.Error:
		return mirrorEndpoint
	case *client.UnexpectedHTTPResponseError:
		return true
	case imageConfigPullError:
		// imageConfigPullError only happens with v2 images, v1 fallback is
		// unnecessary.
		// Failures from a mirror endpoint should result in fallback to the
		// canonical repo.
		return mirrorEndpoint
	case unsupportedMediaTypeError:
		return false
	case error:
		// ESRCH ("no such process") in the message signals a hard failure.
		return !strings.Contains(err.Error(), strings.ToLower(syscall.ESRCH.Error()))
	default:
		// let's be nice and fallback if the error is a completely
		// unexpected one.
		// If new errors have to be handled in some way, please
		// add them to the switch above.
		return true
	}
}
// retryOnError wraps the error in xfer.DoNotRetry if we should not retry the
// operation after this error.
func retryOnError(err error) error {
	switch v := err.(type) {
	case errcode.Errors:
		// Only the first error in the list decides whether to retry.
		if len(v) != 0 {
			return retryOnError(v[0])
		}
	case errcode.Error:
		switch v.Code {
		// Authorization, unsupported-operation, denied, and rate-limit
		// responses will not succeed on retry.
		case errcode.ErrorCodeUnauthorized, errcode.ErrorCodeUnsupported, errcode.ErrorCodeDenied, errcode.ErrorCodeTooManyRequests, v2.ErrorCodeNameUnknown:
			return xfer.DoNotRetry{Err: err}
		}
	case *url.Error:
		switch {
		// Missing credentials will not be fixed by retrying.
		case errors.Is(v.Err, auth.ErrNoBasicAuthCredentials), errors.Is(v.Err, auth.ErrNoToken):
			return xfer.DoNotRetry{Err: v.Err}
		}
		return retryOnError(v.Err)
	case *client.UnexpectedHTTPResponseError, unsupportedMediaTypeError:
		return xfer.DoNotRetry{Err: err}
	case error:
		if errors.Is(err, distribution.ErrBlobUnknown) {
			return xfer.DoNotRetry{Err: err}
		}
		// Out-of-disk-space (ENOSPC message substring) won't be fixed by a retry.
		if strings.Contains(err.Error(), strings.ToLower(syscall.ENOSPC.Error())) {
			return xfer.DoNotRetry{Err: err}
		}
	}
	// let's be nice and fallback if the error is a completely
	// unexpected one.
	// If new errors have to be handled in some way, please
	// add them to the switch above.
	return err
}
// AIModelNotSupportedError is returned when the requested content is an AI
// model, which the Engine cannot pull or run directly.
type AIModelNotSupportedError struct{}

func (e AIModelNotSupportedError) Error() string {
	return `AI models are not yet supported by the Engine, please use "docker model pull/run" instead`
}

// InvalidParameter marks this error as an invalid-parameter error.
func (e AIModelNotSupportedError) InvalidParameter() {}
// invalidManifestClassError is returned when a fetched manifest's media type
// is not allowed for the requested class.
type invalidManifestClassError struct {
	mediaType string
	class     string
}

func (e invalidManifestClassError) Error() string {
	return fmt.Sprintf("Encountered remote %q(%s) when fetching", e.mediaType, e.class)
}

// InvalidParameter marks this error as an invalid-parameter error.
func (e invalidManifestClassError) InvalidParameter() {}
// invalidManifestFormatError is returned for manifests in a format the puller
// does not understand.
type invalidManifestFormatError struct{}

func (invalidManifestFormatError) Error() string {
	return "unsupported manifest format"
}

// InvalidParameter marks this error as an invalid-parameter error.
func (invalidManifestFormatError) InvalidParameter() {}
// reservedNameError is returned when a repository name is reserved and may
// not be used (see validateRepoName).
type reservedNameError string

func (e reservedNameError) Error() string {
	return "'" + string(e) + "' is a reserved name"
}

// Forbidden marks this error as a forbidden-operation error.
func (e reservedNameError) Forbidden() {}
// invalidArgumentErr wraps an error to classify it as an invalid parameter.
type invalidArgumentErr struct{ error }

// InvalidParameter marks this error as an invalid-parameter error.
func (invalidArgumentErr) InvalidParameter() {}
// DeprecatedSchema1ImageError returns an invalid-parameter error explaining
// that Docker Image Format v1 / manifest schema 1 support has been removed.
// When ref is non-nil, the offending reference is included in the message.
func DeprecatedSchema1ImageError(ref reference.Named) error {
	msg := "Docker Image Format v1 and Docker Image manifest version 2, schema 1 support has been removed."
	if ref != nil {
		msg += " Suggest the author of " + ref.String() + " to upgrade the image to the OCI Format or Docker Image manifest v2, schema 2."
	}
	msg += " More information at https://docs.docker.com/go/deprecated-image-specs/"
	return invalidArgumentErr{errors.New(msg)}
}
package distribution
import (
"context"
"encoding/json"
"fmt"
"io"
"strings"
"github.com/containerd/containerd/v2/core/content"
"github.com/containerd/containerd/v2/core/remotes"
cerrdefs "github.com/containerd/errdefs"
"github.com/containerd/log"
"github.com/distribution/reference"
"github.com/docker/distribution"
"github.com/docker/distribution/manifest/manifestlist"
"github.com/docker/distribution/manifest/schema2"
"github.com/docker/docker/registry"
"github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// labelDistributionSource is the prefix of the content-store label recording
// which repository (distribution source) a blob came from. The full label key
// is this prefix suffixed with the registry domain (see
// makeDistributionSourceLabel).
const labelDistributionSource = "containerd.io/distribution.source"
// manifestGetter is used by manifestStore to pare down the requirements to
// implement a full distribution.ManifestService, since `Get` is all we use here.
type manifestGetter interface {
	// Get fetches the manifest with the given digest.
	Get(ctx context.Context, dgst digest.Digest, options ...distribution.ManifestServiceOption) (distribution.Manifest, error)
	// Exists reports whether a manifest with the given digest is present remotely.
	Exists(ctx context.Context, dgst digest.Digest) (bool, error)
}
// manifestStore resolves manifests from the local content store first and
// falls back to a remote manifestGetter, persisting remote results locally
// (see Get / getLocal / Put).
type manifestStore struct {
	local  ContentStore
	remote manifestGetter
}
// ContentStore is the interface used to persist registry blobs
//
// Currently this is only used to persist manifests and manifest lists.
// It is exported because `distribution.Pull` takes one as an argument.
type ContentStore interface {
	content.Ingester
	content.Provider
	// Info returns the content.Info for the given digest.
	Info(ctx context.Context, dgst digest.Digest) (content.Info, error)
	// Abort cancels an in-flight ingest identified by ref.
	Abort(ctx context.Context, ref string) error
	// Update updates the stored content info (e.g. labels); fieldpaths
	// restricts which fields are modified.
	Update(ctx context.Context, info content.Info, fieldpaths ...string) (content.Info, error)
}
// makeDistributionSourceLabel returns the content-store label key and value
// recording that a blob was pulled from ref's repository: the key is
// labelDistributionSource suffixed with the registry domain, and the value is
// the repository path.
func makeDistributionSourceLabel(ref reference.Named) (string, string) {
	domain := reference.Domain(ref)
	if domain == "" {
		domain = registry.DefaultNamespace
	}
	return labelDistributionSource + "." + domain, reference.Path(ref)
}
// Taken from https://github.com/containerd/containerd/blob/e079e4a155c86f07bbd602fe6753ecacc78198c2/remotes/docker/handler.go#L84-L108
//
// appendDistributionSourceLabel adds repo to the comma-separated label value
// originLabel, returning the entries sorted and de-duplicated.
func appendDistributionSourceLabel(originLabel, repo string) string {
	var repos []string
	if originLabel != "" {
		repos = strings.Split(originLabel, ",")
	}
	repos = append(repos, repo)

	// Insertion sort; duplicates are blanked out ("") so they collect at the
	// front of the sorted slice and can be skipped below.
	for i := 1; i < len(repos); i++ {
		cur := repos[i]
		j := i - 1
		for j >= 0 && repos[j] >= cur {
			if repos[j] == cur {
				cur = "" // mark duplicate
			}
			repos[j+1] = repos[j]
			j--
		}
		repos[j+1] = cur
	}

	// Skip the leading empty entries (duplicates).
	start := 0
	for start < len(repos) && repos[start] == "" {
		start++
	}
	return strings.Join(repos[start:], ",")
}
// hasDistributionSource reports whether repo appears as one of the
// comma-separated entries in label.
func hasDistributionSource(label, repo string) bool {
	rest := label
	for {
		src, tail, found := strings.Cut(rest, ",")
		if src == repo {
			return true
		}
		if !found {
			return false
		}
		rest = tail
	}
}
// getLocal returns the manifest described by desc from the local content
// store.
//
// For digest-only (canonical) references it first verifies that the content
// was previously associated with ref's repository — or that the remote still
// has it — so a manifest that never existed in the remote is not returned.
// It also records the distribution-source label on the stored content
// (best-effort).
//
// Fixed: log message typo "mataching" -> "matching".
func (m *manifestStore) getLocal(ctx context.Context, desc ocispec.Descriptor, ref reference.Named) (distribution.Manifest, error) {
	ra, err := m.local.ReaderAt(ctx, desc)
	if err != nil {
		return nil, errors.Wrap(err, "error getting content store reader")
	}
	defer ra.Close()

	distKey, distRepo := makeDistributionSourceLabel(ref)
	info, err := m.local.Info(ctx, desc.Digest)
	if err != nil {
		return nil, errors.Wrap(err, "error getting content info")
	}

	if _, ok := ref.(reference.Canonical); ok {
		// Since this is specified by digest...
		// We know we have the content locally, we need to check if we've seen this content at the specified repository before.
		// If we have, we can just return the manifest from the local content store.
		// If we haven't, we need to check the remote repository to see if it has the content, otherwise we can end up returning
		// a manifest that has never even existed in the remote before.
		if !hasDistributionSource(info.Labels[distKey], distRepo) {
			log.G(ctx).WithField("ref", ref).Debug("found manifest but no matching source repo is listed, checking with remote")
			exists, err := m.remote.Exists(ctx, desc.Digest)
			if err != nil {
				return nil, errors.Wrap(err, "error checking if remote exists")
			}
			if !exists {
				return nil, errors.Wrapf(cerrdefs.ErrNotFound, "manifest %v not found", desc.Digest)
			}
		}
	}

	// Update the distribution sources since we now know the content exists in the remote.
	if info.Labels == nil {
		info.Labels = map[string]string{}
	}
	info.Labels[distKey] = appendDistributionSourceLabel(info.Labels[distKey], distRepo)
	if _, err := m.local.Update(ctx, info, "labels."+distKey); err != nil {
		// Best-effort: failing to record the source label does not prevent
		// returning the manifest.
		log.G(ctx).WithError(err).WithField("ref", ref).Warn("Could not update content distribution source")
	}

	r := io.NewSectionReader(ra, 0, ra.Size())
	data, err := io.ReadAll(r)
	if err != nil {
		return nil, errors.Wrap(err, "error reading manifest from content store")
	}

	manifest, _, err := distribution.UnmarshalManifest(desc.MediaType, data)
	if err != nil {
		return nil, errors.Wrap(err, "error unmarshaling manifest from content store")
	}
	return manifest, nil
}
// getMediaType reads the locally stored blob for desc and infers its manifest
// media type from the JSON content.
func (m *manifestStore) getMediaType(ctx context.Context, desc ocispec.Descriptor) (string, error) {
	ra, err := m.local.ReaderAt(ctx, desc)
	if err != nil {
		return "", errors.Wrap(err, "error getting reader to detect media type")
	}
	defer ra.Close()

	mediaType, err := detectManifestMediaType(ra)
	if err != nil {
		return "", errors.Wrap(err, "error detecting media type")
	}
	return mediaType, nil
}
// Get fetches the manifest described by desc, preferring the local content
// store and falling back to the remote on any local error. Remotely fetched
// manifests are persisted to the local store (best-effort) so later pulls can
// avoid the network.
func (m *manifestStore) Get(ctx context.Context, desc ocispec.Descriptor, ref reference.Named) (distribution.Manifest, error) {
	l := log.G(ctx)

	if desc.MediaType == "" {
		// When pulling by digest we will not have the media type on the
		// descriptor since we have not made a request to the registry yet
		//
		// We already have the digest, so we only lookup locally... by digest.
		//
		// Let's try to detect the media type so we can have a good ref key
		// here. We may not even have the content locally, and this is fine, but
		// if we do we should determine that.
		mt, err := m.getMediaType(ctx, desc)
		if err != nil && !cerrdefs.IsNotFound(err) {
			l.WithError(err).Warn("Error looking up media type of content")
		}
		desc.MediaType = mt
	}

	key := remotes.MakeRefKey(ctx, desc)

	// Here we open a writer to the requested content. This both gives us a
	// reference to write to if indeed we need to persist it and increments the
	// ref count on the content.
	w, err := m.local.Writer(ctx, content.WithDescriptor(desc), content.WithRef(key))
	if err != nil {
		if cerrdefs.IsAlreadyExists(err) {
			// Content is already stored locally; try to serve it from there.
			var manifest distribution.Manifest
			if manifest, err = m.getLocal(ctx, desc, ref); err == nil {
				return manifest, nil
			}
		}
		// always fallback to the remote if there is an error with the local store
	}
	if w != nil {
		defer w.Close()
	}

	l.WithError(err).Debug("Fetching manifest from remote")

	manifest, err := m.remote.Get(ctx, desc.Digest)
	if err != nil {
		// Clean up the pending ingest; an abort failure is only logged.
		if err := m.local.Abort(ctx, key); err != nil {
			l.WithError(err).Warn("Error while attempting to abort content ingest")
		}
		return nil, err
	}

	if w != nil {
		// if `w` is nil here, something happened with the content store, so don't bother trying to persist.
		if err := m.Put(ctx, manifest, desc, w, ref); err != nil {
			if err := m.local.Abort(ctx, key); err != nil {
				l.WithError(err).Warn("error aborting content ingest")
			}
			// Persisting is best-effort; the fetched manifest is still returned.
			l.WithError(err).Warn("Error persisting manifest")
		}
	}
	return manifest, nil
}
// Put writes the manifest's payload through the provided content-store writer
// and commits it, labelling the content with the distribution source derived
// from ref.
func (m *manifestStore) Put(ctx context.Context, manifest distribution.Manifest, desc ocispec.Descriptor, w content.Writer, ref reference.Named) error {
	_, payload, err := manifest.Payload()
	if err != nil {
		return err
	}

	desc.Size = int64(len(payload))
	if _, err := w.Write(payload); err != nil {
		return errors.Wrap(err, "error writing manifest to content store")
	}

	distKey, distSource := makeDistributionSourceLabel(ref)
	labels := map[string]string{distKey: distSource}
	if err := w.Commit(ctx, desc.Size, desc.Digest, content.WithLabels(labels)); err != nil {
		return errors.Wrap(err, "error committing manifest to content store")
	}
	return nil
}
// detectManifestMediaType reads the entire blob behind ra and infers its
// manifest media type from the JSON content.
func detectManifestMediaType(ra content.ReaderAt) (string, error) {
	buf := make([]byte, ra.Size())
	if _, err := ra.ReadAt(buf, 0); err != nil {
		return "", fmt.Errorf("error detecting media type: %w", err)
	}
	return detectManifestBlobMediaType(buf)
}
const (
	// MediaTypeDockerSchema1Manifest specifies the mediaType for legacy "docker v1" manifests.
	// It is a local copy of [schema1.MediaTypeManifest]. Note that for schema version 1, the media
	// type is optionally "application/json".
	//
	// This const is meant for internal use and should not be used externally as
	// it may be removed in a future release.
	//
	// [schema1.MediaTypeManifest]: https://pkg.go.dev/github.com/docker/distribution@v2.8.3+incompatible/manifest/schema1#MediaTypeManifest
	MediaTypeDockerSchema1Manifest = "application/vnd.docker.distribution.manifest.v1+json"

	// MediaTypeDockerSchema1SignedManifest specifies the mediatype for legacy "docker v1" signed manifests.
	// It is a local copy of [schema1.MediaTypeSignedManifest].
	//
	// This const is meant for internal use and should not be used externally as
	// it may be removed in a future release.
	//
	// [schema1.MediaTypeSignedManifest]: https://pkg.go.dev/github.com/docker/distribution@v2.8.3+incompatible/manifest/schema1#MediaTypeSignedManifest
	MediaTypeDockerSchema1SignedManifest = "application/vnd.docker.distribution.manifest.v1+prettyjws"
)
// detectManifestBlobMediaType infers the media type of the raw manifest JSON
// in dt.
//
// This is used when the manifest store does not know the media type of a sha it
// was told to get. This would currently only happen when pulling by digest.
// The media type is needed so the blob can be unmarshalled properly.
func detectManifestBlobMediaType(dt []byte) (string, error) {
	var mfst struct {
		MediaType string          `json:"mediaType"`
		Manifests json.RawMessage `json:"manifests"` // oci index, manifest list
		Config    json.RawMessage `json:"config"`    // schema2 Manifest
		Layers    json.RawMessage `json:"layers"`    // schema2 Manifest
		FSLayers  json.RawMessage `json:"fsLayers"`  // schema1 Manifest
	}

	if err := json.Unmarshal(dt, &mfst); err != nil {
		return "", err
	}

	// We may have a media type specified in the json, in which case that should be used.
	// Docker types should generally have a media type set.
	// OCI (golang) types do not have a `mediaType` defined, and it is optional in the spec.
	//
	// `distribution.UnmarshalManifest`, which is used to unmarshal this for real, checks these media type values.
	// If the specified media type does not match it will error, and in some cases (docker media types) it is required.
	// So pretty much if we don't have a media type we can fall back to OCI.
	// This does have a special fallback for schema1 manifests just because it is easy to detect.
	switch mfst.MediaType {
	case schema2.MediaTypeManifest, ocispec.MediaTypeImageManifest:
		// A single-image manifest must not carry index or schema1 fields.
		if mfst.Manifests != nil || mfst.FSLayers != nil {
			return "", fmt.Errorf(`media-type: %q should not have "manifests" or "fsLayers"`, mfst.MediaType)
		}
		return mfst.MediaType, nil
	case manifestlist.MediaTypeManifestList, ocispec.MediaTypeImageIndex:
		// An index / manifest list must not carry image-manifest or schema1 fields.
		if mfst.Config != nil || mfst.Layers != nil || mfst.FSLayers != nil {
			return "", fmt.Errorf(`media-type: %q should not have "config", "layers", or "fsLayers"`, mfst.MediaType)
		}
		return mfst.MediaType, nil
	case MediaTypeDockerSchema1Manifest, MediaTypeDockerSchema1SignedManifest:
		// Schema 1 support has been removed.
		return "", DeprecatedSchema1ImageError(nil)
	default:
		if mfst.MediaType != "" {
			return mfst.MediaType, nil
		}
	}

	// No media type in the JSON: infer the type from which fields are present.
	switch {
	case mfst.FSLayers != nil && mfst.Manifests == nil && mfst.Layers == nil && mfst.Config == nil:
		// fsLayers only appears in (removed) schema 1 manifests.
		return "", DeprecatedSchema1ImageError(nil)
	case mfst.Config != nil && mfst.Manifests == nil && mfst.FSLayers == nil,
		mfst.Layers != nil && mfst.Manifests == nil && mfst.FSLayers == nil:
		return ocispec.MediaTypeImageManifest, nil
	case mfst.Config == nil && mfst.Layers == nil && mfst.FSLayers == nil:
		// fallback to index
		return ocispec.MediaTypeImageIndex, nil
	}
	return "", errors.New("media-type: cannot determine")
}
package metadata
import (
"os"
"path/filepath"
"sync"
"github.com/moby/sys/atomicwriter"
)
// Store implements a K/V store for mapping distribution-related IDs
// to on-disk layer IDs and image IDs. The namespace identifies the type of
// mapping (i.e. "v1ids" or "artifacts"). Implementations must be
// goroutine-safe.
type Store interface {
	// Get retrieves data by namespace and key.
	Get(namespace string, key string) ([]byte, error)
	// Set writes data indexed by namespace and key.
	Set(namespace, key string, value []byte) error
	// Delete removes data indexed by namespace and key.
	Delete(namespace, key string) error
}
// FSMetadataStore uses the filesystem to associate metadata with layer and
// image IDs. Each namespace is a subdirectory of basePath and each key is a
// file within it.
type FSMetadataStore struct {
	sync.RWMutex        // guards all file operations under basePath
	basePath     string // root directory of the store
}
// NewFSMetadataStore creates a new filesystem-based metadata store rooted at
// basePath, creating the directory (mode 0700) if it does not exist.
func NewFSMetadataStore(basePath string) (*FSMetadataStore, error) {
	if err := os.MkdirAll(basePath, 0o700); err != nil {
		return nil, err
	}
	return &FSMetadataStore{basePath: basePath}, nil
}
// path returns the on-disk file path for the given namespace/key pair.
func (store *FSMetadataStore) path(namespace, key string) string {
	return filepath.Join(store.basePath, namespace, key)
}
// Get retrieves data by namespace and key. The data is read from a file named
// after the key, stored in the namespace's directory.
func (store *FSMetadataStore) Get(namespace string, key string) ([]byte, error) {
	store.RLock()
	defer store.RUnlock()

	target := store.path(namespace, key)
	return os.ReadFile(target)
}
// Set writes data indexed by namespace and key. The data is written atomically
// to a file named after the key in the namespace's directory, creating the
// directory if needed.
func (store *FSMetadataStore) Set(namespace, key string, value []byte) error {
	store.Lock()
	defer store.Unlock()

	target := store.path(namespace, key)
	if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
		return err
	}
	return atomicwriter.WriteFile(target, value, 0o644)
}
// Delete removes data indexed by namespace and key: the file named after the
// key in the namespace's directory is deleted.
func (store *FSMetadataStore) Delete(namespace, key string) error {
	store.Lock()
	defer store.Unlock()

	return os.Remove(store.path(namespace, key))
}
package metadata
import (
"crypto/hmac"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"errors"
"github.com/docker/docker/layer"
"github.com/moby/moby/api/types/registry"
"github.com/opencontainers/go-digest"
)
// V2MetadataService maps layer IDs to a set of known metadata for
// the layer.
type V2MetadataService interface {
	// GetMetadata returns all metadata entries known for the given DiffID.
	GetMetadata(diffID layer.DiffID) ([]V2Metadata, error)
	// GetDiffID returns the layer DiffID recorded for the given blob digest.
	GetDiffID(dgst digest.Digest) (layer.DiffID, error)
	// Add associates metadata with a layer DiffID.
	Add(diffID layer.DiffID, metadata V2Metadata) error
	// TagAndAdd tags the metadata with an HMAC derived from hmacKey before
	// associating it with the DiffID.
	TagAndAdd(diffID layer.DiffID, hmacKey []byte, metadata V2Metadata) error
	// Remove disassociates a metadata entry from its layer DiffID.
	Remove(metadata V2Metadata) error
}
// v2MetadataService implements V2MetadataService on top of a Store.
type v2MetadataService struct {
	store Store // backing K/V store; may be nil (operations become no-ops or errors)
}

// Compile-time interface conformance check.
var _ V2MetadataService = &v2MetadataService{}
// V2Metadata contains the digest and source repository information for a layer.
type V2Metadata struct {
	// Digest is the blob digest of the layer in the source repository.
	Digest digest.Digest
	// SourceRepository is the repository the layer was seen in.
	SourceRepository string
	// HMAC hashes above attributes with recent authconfig digest used as a key in order to determine matching
	// metadata entries accompanied by the same credentials without actually exposing them.
	HMAC string
}
// CheckV2MetadataHMAC returns true if the given "meta" is tagged with a hmac
// hashed by the given "key".
func CheckV2MetadataHMAC(meta *V2Metadata, key []byte) bool {
	if meta.HMAC == "" || len(key) == 0 {
		// Untagged metadata only matches an empty key, and vice versa.
		return meta.HMAC == "" && len(key) == 0
	}

	stored, err := hex.DecodeString(meta.HMAC)
	if err != nil {
		return false
	}

	mac := hmac.New(sha256.New, key)
	mac.Write([]byte(meta.Digest))
	mac.Write([]byte(meta.SourceRepository))
	return hmac.Equal(stored, mac.Sum(nil))
}
// ComputeV2MetadataHMAC returns a hmac for the given "meta" hashed by the
// given key. An empty key or nil meta yields an empty string.
func ComputeV2MetadataHMAC(key []byte, meta *V2Metadata) string {
	if len(key) == 0 || meta == nil {
		return ""
	}
	h := hmac.New(sha256.New, key)
	h.Write([]byte(meta.Digest))
	h.Write([]byte(meta.SourceRepository))
	return hex.EncodeToString(h.Sum(nil))
}
// ComputeV2MetadataHMACKey returns a key for the given "authConfig" that can
// be used to hash v2 metadata entries. A nil authConfig yields a nil key.
func ComputeV2MetadataHMACKey(authConfig *registry.AuthConfig) ([]byte, error) {
	if authConfig == nil {
		return nil, nil
	}
	// Only credential-bearing fields participate in the key.
	input := authConfigKeyInput{
		Username:      authConfig.Username,
		Password:      authConfig.Password,
		Auth:          authConfig.Auth,
		IdentityToken: authConfig.IdentityToken,
		RegistryToken: authConfig.RegistryToken,
	}
	serialized, err := json.Marshal(&input)
	if err != nil {
		return nil, err
	}
	return []byte(digest.FromBytes(serialized)), nil
}
// authConfigKeyInput is a reduced AuthConfig structure holding just relevant credential data eligible for
// hmac key creation.
type authConfigKeyInput struct {
	Username      string `json:"username,omitempty"`
	Password      string `json:"password,omitempty"`
	Auth          string `json:"auth,omitempty"`
	IdentityToken string `json:"identitytoken,omitempty"`
	RegistryToken string `json:"registrytoken,omitempty"`
}
// maxMetadata is the maximum number of metadata entries to keep per layer
// DiffID; older entries are dropped by Add when the cap is exceeded.
const maxMetadata = 50
// NewV2MetadataService creates a new diff ID to v2 metadata mapping service
// backed by the given store.
func NewV2MetadataService(store Store) V2MetadataService {
	return &v2MetadataService{store: store}
}
// diffIDNamespace is the store namespace holding metadata indexed by DiffID.
func (serv *v2MetadataService) diffIDNamespace() string {
	return "v2metadata-by-diffid"
}

// digestNamespace is the store namespace holding DiffIDs indexed by digest.
func (serv *v2MetadataService) digestNamespace() string {
	return "diffid-by-digest"
}

// diffIDKey builds the store key ("algorithm/encoded") for a layer DiffID.
func (serv *v2MetadataService) diffIDKey(diffID layer.DiffID) string {
	return string(diffID.Algorithm()) + "/" + diffID.Encoded()
}

// digestKey builds the store key ("algorithm/encoded") for a digest.
func (serv *v2MetadataService) digestKey(dgst digest.Digest) string {
	return string(dgst.Algorithm()) + "/" + dgst.Encoded()
}
// GetMetadata finds the metadata associated with a layer DiffID.
func (serv *v2MetadataService) GetMetadata(diffID layer.DiffID) ([]V2Metadata, error) {
	if serv.store == nil {
		return nil, errors.New("no metadata storage")
	}

	data, err := serv.store.Get(serv.diffIDNamespace(), serv.diffIDKey(diffID))
	if err != nil {
		return nil, err
	}

	var metadata []V2Metadata
	if err := json.Unmarshal(data, &metadata); err != nil {
		return nil, err
	}
	return metadata, nil
}
// GetDiffID finds a layer DiffID from a digest.
func (serv *v2MetadataService) GetDiffID(dgst digest.Digest) (layer.DiffID, error) {
	if serv.store == nil {
		return "", errors.New("no metadata storage")
	}

	raw, err := serv.store.Get(serv.digestNamespace(), serv.digestKey(dgst))
	if err != nil {
		return "", err
	}
	return layer.DiffID(raw), nil
}
// Add associates metadata with a layer DiffID. If too many metadata entries are
// present, the oldest one is dropped.
func (serv *v2MetadataService) Add(diffID layer.DiffID, metadata V2Metadata) error {
	if serv.store == nil {
		// No backing store: the add becomes a no-op.
		// TODO: implement in memory storage
		return nil
	}

	existing, err := serv.GetMetadata(diffID)
	if err != nil {
		existing = nil
	}

	// Keep every entry except any equal to the one being added, then append
	// the new entry so it becomes the most recent.
	updated := make([]V2Metadata, 0, len(existing)+1)
	for _, entry := range existing {
		if entry != metadata {
			updated = append(updated, entry)
		}
	}
	updated = append(updated, metadata)

	// Enforce the cap, dropping the oldest entries from the front.
	if excess := len(updated) - maxMetadata; excess > 0 {
		updated = updated[excess:]
	}

	data, err := json.Marshal(updated)
	if err != nil {
		return err
	}

	if err := serv.store.Set(serv.diffIDNamespace(), serv.diffIDKey(diffID), data); err != nil {
		return err
	}
	// Also record the reverse mapping: digest -> DiffID.
	return serv.store.Set(serv.digestNamespace(), serv.digestKey(metadata.Digest), []byte(diffID))
}
// TagAndAdd amends the given "meta" for hmac hashed by the given "hmacKey" and associates it with a layer
// DiffID. If too many metadata entries are present, the oldest one is dropped.
func (serv *v2MetadataService) TagAndAdd(diffID layer.DiffID, hmacKey []byte, meta V2Metadata) error {
	// meta is a copy, so mutating its HMAC here does not affect the caller.
	meta.HMAC = ComputeV2MetadataHMAC(hmacKey, &meta)
	return serv.Add(diffID, meta)
}
// Remove disassociates a metadata entry from a layer DiffID.
func (serv *v2MetadataService) Remove(metadata V2Metadata) error {
	if serv.store == nil {
		// No backing store: the remove becomes a no-op.
		// TODO: implement in memory storage
		return nil
	}

	diffID, err := serv.GetDiffID(metadata.Digest)
	if err != nil {
		return err
	}

	existing, err := serv.GetMetadata(diffID)
	if err != nil {
		existing = nil
	}

	// Copy every entry except the one being removed.
	remaining := make([]V2Metadata, 0, len(existing))
	for _, entry := range existing {
		if entry != metadata {
			remaining = append(remaining, entry)
		}
	}

	if len(remaining) == 0 {
		// Nothing left for this DiffID: drop the record entirely.
		return serv.store.Delete(serv.diffIDNamespace(), serv.diffIDKey(diffID))
	}

	data, err := json.Marshal(remaining)
	if err != nil {
		return err
	}
	return serv.store.Set(serv.diffIDNamespace(), serv.diffIDKey(diffID), data)
}
package distribution
import (
"context"
"fmt"
"github.com/containerd/log"
"github.com/distribution/reference"
refstore "github.com/docker/docker/reference"
"github.com/docker/docker/registry"
"github.com/moby/moby/api/types/events"
"github.com/opencontainers/go-digest"
"github.com/pkg/errors"
)
// Pull initiates a pull operation. ref names the repository to pull from, and
// may carry a tag or digest; on success a pull event is logged via the
// configured ImageEventLogger.
func Pull(ctx context.Context, ref reference.Named, config *ImagePullConfig, local ContentStore) error {
	repoName, err := pullEndpoints(ctx, config.RegistryService, ref, func(ctx context.Context, repoName reference.Named, endpoint registry.APIEndpoint) error {
		log.G(ctx).Debugf("Trying to pull %s from %s", reference.FamiliarName(repoName), endpoint.URL)
		return newPuller(endpoint, repoName, config, local).pull(ctx, ref)
	})
	if err != nil {
		return err
	}
	config.ImageEventLogger(ctx, reference.FamiliarString(ref), reference.FamiliarName(repoName), events.ActionPull)
	return nil
}
// Tags returns available tags for the given image in the remote repository.
func Tags(ctx context.Context, ref reference.Named, config *Config) ([]string, error) {
	var tags []string
	_, err := pullEndpoints(ctx, config.RegistryService, ref, func(ctx context.Context, repoName reference.Named, endpoint registry.APIEndpoint) error {
		repository, err := newRepository(ctx, repoName, endpoint, config.MetaHeaders, config.AuthConfig, "pull")
		if err != nil {
			return err
		}
		tags, err = repository.Tags(ctx).All(ctx)
		return err
	})
	return tags, err
}
// noBaseImageSpecifier is the symbol used by the FROM
// command to specify that no base image is to be used.
const noBaseImageSpecifier = "scratch"
// validateRepoName validates the name of a repository, rejecting the reserved
// "scratch" name.
func validateRepoName(name reference.Named) error {
	if reference.FamiliarName(name) != noBaseImageSpecifier {
		return nil
	}
	return errors.WithStack(reservedNameError(noBaseImageSpecifier))
}
// addDigestReference records a digest reference (ref@dgst -> id) in the
// reference store. If the digest is already mapped to a different image ID the
// existing mapping is kept and the conflict is only logged.
func addDigestReference(store refstore.Store, ref reference.Named, dgst digest.Digest, id digest.Digest) error {
	dgstRef, err := reference.WithDigest(reference.TrimNamed(ref), dgst)
	if err != nil {
		return err
	}

	oldTagID, err := store.Get(dgstRef)
	switch {
	case err == nil:
		if oldTagID != id {
			// Updating digests not supported by reference store
			log.G(context.TODO()).Errorf("Image ID for digest %s changed from %s to %s, cannot update", dgst.String(), oldTagID, id)
		}
		return nil
	case errors.Is(err, refstore.ErrDoesNotExist):
		return store.AddDigest(dgstRef, id, true)
	default:
		return err
	}
}
// pullEndpoints looks up the pull endpoints for ref's registry domain and
// invokes f for each endpoint in order until one succeeds. Errors that permit
// fallback move on to the next endpoint; any other error aborts. It returns
// the trimmed repository name together with the translated error (if any).
func pullEndpoints(ctx context.Context, registryService RegistryResolver, ref reference.Named,
	f func(context.Context, reference.Named, registry.APIEndpoint) error,
) (reference.Named, error) {
	repoName := reference.TrimNamed(ref)
	// makes sure name is not `scratch`
	if err := validateRepoName(repoName); err != nil {
		return repoName, err
	}

	endpoints, err := registryService.LookupPullEndpoints(reference.Domain(repoName))
	if err != nil {
		return repoName, err
	}

	var (
		lastErr error
		// confirmedTLSRegistries is a map indicating which registries
		// are known to be using TLS. There should never be a plaintext
		// retry for any of these.
		confirmedTLSRegistries = make(map[string]struct{})
	)
	for _, endpoint := range endpoints {
		if endpoint.URL.Scheme != "https" {
			// Skip plaintext endpoints for hosts already seen speaking TLS.
			if _, confirmedTLS := confirmedTLSRegistries[endpoint.URL.Host]; confirmedTLS {
				log.G(ctx).Debugf("Skipping non-TLS endpoint %s for host/port that appears to use TLS", endpoint.URL)
				continue
			}
		}

		log.G(ctx).Debugf("Trying to pull %s from %s", reference.FamiliarName(repoName), endpoint.URL)

		if err := f(ctx, repoName, endpoint); err != nil {
			// Wrap errors that permit fallback, unless already wrapped.
			if _, ok := err.(fallbackError); !ok && continueOnError(err, endpoint.Mirror) {
				err = fallbackError{
					err:         err,
					transportOK: true,
				}
			}
			// Was this pull cancelled? If so, don't try to fall
			// back.
			fallback := false
			select {
			case <-ctx.Done():
			default:
				if fallbackErr, ok := err.(fallbackError); ok {
					fallback = true
					if fallbackErr.transportOK && endpoint.URL.Scheme == "https" {
						// Transport worked over TLS; never retry this host in plaintext.
						confirmedTLSRegistries[endpoint.URL.Host] = struct{}{}
					}
					err = fallbackErr.err
				}
			}
			if fallback {
				lastErr = err
				log.G(ctx).Infof("Attempting next endpoint for pull after error: %v", err)
				continue
			}
			// FIXME(thaJeztah): cleanup error and context handling in this package, as it's really messy.
			if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
				log.G(ctx).WithError(err).Info("Not continuing with pull after error")
			} else {
				log.G(ctx).WithError(err).Error("Not continuing with pull after error")
			}
			return repoName, translatePullError(err, ref)
		}

		return repoName, nil
	}

	if lastErr == nil {
		lastErr = fmt.Errorf("no endpoints found for %s", reference.FamiliarString(ref))
	}
	return repoName, translatePullError(lastErr, ref)
}
package distribution
import (
"context"
"fmt"
"io"
"os"
"runtime"
"strings"
"time"
"github.com/containerd/log"
"github.com/containerd/platforms"
"github.com/distribution/reference"
"github.com/docker/distribution"
"github.com/docker/distribution/manifest/manifestlist"
"github.com/docker/distribution/manifest/ocischema"
"github.com/docker/distribution/manifest/schema2"
"github.com/docker/distribution/registry/client/transport"
"github.com/docker/docker/distribution/metadata"
"github.com/docker/docker/distribution/xfer"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
"github.com/docker/docker/pkg/ioutils"
"github.com/docker/docker/pkg/progress"
"github.com/docker/docker/pkg/stringid"
refstore "github.com/docker/docker/reference"
"github.com/docker/docker/registry"
"github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
"github.com/tonistiigi/go-archvariant"
)
var (
	// errRootFSMismatch is returned when the layers in a manifest do not
	// match the rootfs recorded in the image configuration.
	errRootFSMismatch = errors.New("layers from manifest don't match image configuration")
	// errRootFSInvalid is returned when the image configuration contains an
	// invalid rootfs.
	errRootFSInvalid = errors.New("invalid rootfs in image configuration")
)
// imageConfigPullError is an error pulling the image config blob
// (only applies to schema2).
type imageConfigPullError struct {
	Err error // underlying error from the config blob fetch
}

// Error returns the error string for imageConfigPullError.
func (e imageConfigPullError) Error() string {
	return "error pulling image configuration: " + e.Err.Error()
}
// newPuller returns a puller to pull from a v2 registry, wiring up the
// metadata service and a manifest store backed by the given local content
// store.
func newPuller(endpoint registry.APIEndpoint, repoName reference.Named, config *ImagePullConfig, local ContentStore) *puller {
	p := &puller{
		metadataService: metadata.NewV2MetadataService(config.MetadataStore),
		endpoint:        endpoint,
		config:          config,
		repoName:        repoName,
	}
	p.manifestStore = &manifestStore{local: local}
	return p
}
// puller pulls images from a single v2 registry endpoint.
type puller struct {
	metadataService metadata.V2MetadataService
	endpoint        registry.APIEndpoint
	config          *ImagePullConfig
	repoName        reference.Named
	repo            distribution.Repository // set by pull() once the repository client is created
	manifestStore   *manifestStore
}
// pull creates the repository client and manifest service for the endpoint,
// then pulls ref from that repository.
func (p *puller) pull(ctx context.Context, ref reference.Named) error {
	var err error
	// TODO(thaJeztah): do we need p.repoName at all, as it would probably be same as ref?
	p.repo, err = newRepository(ctx, p.repoName, p.endpoint, p.config.MetaHeaders, p.config.AuthConfig, "pull")
	if err != nil {
		log.G(ctx).Warnf("Error getting v2 registry: %v", err)
		return err
	}

	// The manifest store fetches from this remote when content is missing locally.
	p.manifestStore.remote, err = p.repo.Manifests(ctx)
	if err != nil {
		return err
	}

	return p.pullRepository(ctx, ref)
}
// pullRepository pulls the single tag/digest in ref or, when ref is name-only,
// every tag in the repository, then writes a final status message indicating
// whether anything new was downloaded.
func (p *puller) pullRepository(ctx context.Context, ref reference.Named) error {
	var layersDownloaded bool
	if !reference.IsNameOnly(ref) {
		var err error
		layersDownloaded, err = p.pullTag(ctx, ref, p.config.Platform)
		if err != nil {
			return err
		}
	} else {
		// Name-only reference: pull all tags in the repository.
		tags, err := p.repo.Tags(ctx).All(ctx)
		if err != nil {
			return err
		}

		for _, tag := range tags {
			tagRef, err := reference.WithTag(ref, tag)
			if err != nil {
				return err
			}
			pulledNew, err := p.pullTag(ctx, tagRef, p.config.Platform)
			if err != nil {
				// Since this is the pull-all-tags case, don't
				// allow an error pulling a particular tag to
				// make the whole pull fall back to v1.
				if fallbackErr, ok := err.(fallbackError); ok {
					return fallbackErr.err
				}
				return err
			}
			// pulledNew is true if either new layers were downloaded OR if existing images were newly tagged
			// TODO(tiborvass): should we change the name of `layersDownload`? What about message in WriteStatus?
			layersDownloaded = layersDownloaded || pulledNew
		}
	}

	p.writeStatus(reference.FamiliarString(ref), layersDownloaded)

	return nil
}
// writeStatus writes a final status message to the progress output. If
// layersDownloaded is true, the message indicates that a newer image was
// downloaded; otherwise it indicates the image is up to date. requestedTag is
// the tag the message refers to.
func (p *puller) writeStatus(requestedTag string, layersDownloaded bool) {
	msg := "Status: Image is up to date for " + requestedTag
	if layersDownloaded {
		msg = "Status: Downloaded newer image for " + requestedTag
	}
	progress.Message(p.config.ProgressOutput, "", msg)
}
// layerDescriptor describes a single layer blob to be downloaded from a
// repository. It implements the download-manager descriptor interface
// (Key/ID/DiffID/Download/Close/Registered).
type layerDescriptor struct {
	digest          digest.Digest
	diffID          layer.DiffID
	repoName        reference.Named
	repo            distribution.Repository
	metadataService metadata.V2MetadataService
	// tmpFile holds the partially-downloaded blob between retries so
	// the download can be resumed; it is handed off to the caller of
	// Download on success.
	tmpFile *os.File
	// verifier accumulates the downloaded bytes to check them against
	// digest; reset to nil when the download restarts from scratch.
	verifier digest.Verifier
	src      distribution.Descriptor
}
// Key returns the unique identifier used to deduplicate concurrent
// downloads of the same blob.
func (ld *layerDescriptor) Key() string {
	return fmt.Sprintf("v2:%s", ld.digest.String())
}
// ID returns a short identifier for the layer, used in progress output.
func (ld *layerDescriptor) ID() string {
	return stringid.TruncateID(ld.digest.String())
}
// DiffID returns the layer's DiffID, preferring the value populated from
// the image config and falling back to the metadata service.
func (ld *layerDescriptor) DiffID() (layer.DiffID, error) {
	if ld.diffID == "" {
		return ld.metadataService.GetDiffID(ld.digest)
	}
	return ld.diffID, nil
}
// Download fetches the layer blob into a temporary file, resuming a
// previous partial download when possible, and verifies the bytes against
// the layer digest. On success it hands the temporary file off to the
// caller (wrapped so closing it also removes it from disk) together with
// the total blob size. Errors wrapped in xfer.DoNotRetry must not be
// retried by the download manager.
func (ld *layerDescriptor) Download(ctx context.Context, progressOutput progress.Output) (io.ReadCloser, int64, error) {
	log.G(ctx).Debugf("pulling blob %q", ld.digest)

	var (
		err    error
		offset int64
	)

	if ld.tmpFile == nil {
		// First attempt: start with a fresh temp file.
		ld.tmpFile, err = createDownloadFile()
		if err != nil {
			return nil, 0, xfer.DoNotRetry{Err: err}
		}
	} else {
		// Retry: find out how much we already have so we can resume.
		offset, err = ld.tmpFile.Seek(0, io.SeekEnd)
		if err != nil {
			// The partial file is unusable; discard it and start over.
			log.G(ctx).Debugf("error seeking to end of download file: %v", err)
			offset = 0

			_ = ld.tmpFile.Close()
			if err := os.Remove(ld.tmpFile.Name()); err != nil {
				log.G(ctx).Errorf("Failed to remove temp file: %s", ld.tmpFile.Name())
			}
			ld.tmpFile, err = createDownloadFile()
			if err != nil {
				return nil, 0, xfer.DoNotRetry{Err: err}
			}
		} else if offset != 0 {
			log.G(ctx).Debugf("attempting to resume download of %q from %d bytes", ld.digest, offset)
		}
	}

	tmpFile := ld.tmpFile

	layerDownload, err := ld.open(ctx)
	if err != nil {
		log.G(ctx).Errorf("Error initiating layer download: %v", err)
		return nil, 0, retryOnError(err)
	}

	if offset != 0 {
		// Position the remote stream just past the bytes we already have.
		_, err := layerDownload.Seek(offset, io.SeekStart)
		if err != nil {
			if err := ld.truncateDownloadFile(); err != nil {
				return nil, 0, xfer.DoNotRetry{Err: err}
			}
			return nil, 0, err
		}
	}
	// Determine the total blob size by seeking to the end of the stream.
	size, err := layerDownload.Seek(0, io.SeekEnd)
	if err != nil {
		// Seek failed, perhaps because there was no Content-Length
		// header. This shouldn't fail the download, because we can
		// still continue without a progress bar.
		size = 0
	} else {
		if size != 0 && offset > size {
			// Our partial file is larger than the remote blob, so it
			// cannot be a prefix of it; restart from scratch.
			log.G(ctx).Debug("Partial download is larger than full blob. Starting over")
			offset = 0
			if err := ld.truncateDownloadFile(); err != nil {
				return nil, 0, xfer.DoNotRetry{Err: err}
			}
		}

		// Restore the seek offset either at the beginning of the
		// stream, or just after the last byte we have from previous
		// attempts.
		_, err = layerDownload.Seek(offset, io.SeekStart)
		if err != nil {
			return nil, 0, err
		}
	}

	reader := progress.NewProgressReader(ioutils.NewCancelReadCloser(ctx, layerDownload), progressOutput, size-offset, ld.ID(), "Downloading")
	defer reader.Close()

	if ld.verifier == nil {
		ld.verifier = ld.digest.Verifier()
	}

	// Copy the remote bytes into the temp file while feeding them to the
	// digest verifier.
	_, err = io.Copy(tmpFile, io.TeeReader(reader, ld.verifier))
	if err != nil {
		if errors.Is(err, transport.ErrWrongCodeForByteRange) {
			// The registry rejected our range request; start over.
			if err := ld.truncateDownloadFile(); err != nil {
				return nil, 0, xfer.DoNotRetry{Err: err}
			}
			return nil, 0, err
		}
		return nil, 0, retryOnError(err)
	}

	progress.Update(progressOutput, ld.ID(), "Verifying Checksum")

	if !ld.verifier.Verified() {
		err = fmt.Errorf("filesystem layer verification failed for digest %s", ld.digest)
		log.G(ctx).Error(err)

		// Allow a retry if this digest verification error happened
		// after a resumed download.
		if offset != 0 {
			if err := ld.truncateDownloadFile(); err != nil {
				return nil, 0, xfer.DoNotRetry{Err: err}
			}

			return nil, 0, err
		}
		return nil, 0, xfer.DoNotRetry{Err: err}
	}

	progress.Update(progressOutput, ld.ID(), "Download complete")

	log.G(ctx).Debugf("Downloaded %s to tempfile %s", ld.ID(), tmpFile.Name())

	// Rewind so the caller reads the blob from the start.
	_, err = tmpFile.Seek(0, io.SeekStart)
	if err != nil {
		_ = tmpFile.Close()
		if err := os.Remove(tmpFile.Name()); err != nil {
			log.G(ctx).Errorf("Failed to remove temp file: %s", tmpFile.Name())
		}
		ld.tmpFile = nil
		ld.verifier = nil
		return nil, 0, xfer.DoNotRetry{Err: err}
	}

	// hand off the temporary file to the download manager, so it will only
	// be closed once
	ld.tmpFile = nil

	return ioutils.NewReadCloserWrapper(tmpFile, func() error {
		_ = tmpFile.Close()
		err := os.RemoveAll(tmpFile.Name())
		if err != nil {
			log.G(ctx).Errorf("Failed to remove temp file: %s", tmpFile.Name())
		}
		return err
	}), size, nil
}
// Close releases the temporary download file, if any, removing it from
// disk. Removal failures are logged but otherwise ignored.
func (ld *layerDescriptor) Close() {
	if ld.tmpFile == nil {
		return
	}
	_ = ld.tmpFile.Close()
	if err := os.RemoveAll(ld.tmpFile.Name()); err != nil {
		log.G(context.TODO()).Errorf("Failed to remove temp file: %s", ld.tmpFile.Name())
	}
}
// truncateDownloadFile rewinds and empties the temporary download file so
// the blob download can be restarted from scratch.
func (ld *layerDescriptor) truncateDownloadFile() error {
	// Need a new hash context since we will be redoing the download.
	ld.verifier = nil

	_, err := ld.tmpFile.Seek(0, io.SeekStart)
	if err != nil {
		log.G(context.TODO()).Errorf("error seeking to beginning of download file: %v", err)
		return err
	}
	if err = ld.tmpFile.Truncate(0); err != nil {
		log.G(context.TODO()).Errorf("error truncating download file: %v", err)
		return err
	}
	return nil
}
// Registered is called once the layer has been registered in the layer
// store; it records the DiffID-to-blobsum mapping in the metadata service
// so future pushes/pulls can reuse it. The error is intentionally ignored
// as this is a best-effort cache update.
func (ld *layerDescriptor) Registered(diffID layer.DiffID) {
	// Cache mapping from this layer's DiffID to the blobsum
	_ = ld.metadataService.Add(diffID, metadata.V2Metadata{Digest: ld.digest, SourceRepository: ld.repoName.Name()})
}
// pullTag resolves ref (tag or digest) to a manifest, pulls the image it
// describes (dispatching on the manifest type), and updates the reference
// store. It returns true when the pull changed local state: new layers
// were downloaded or references were newly recorded.
func (p *puller) pullTag(ctx context.Context, ref reference.Named, platform *ocispec.Platform) (tagUpdated bool, _ error) {
	var (
		tagOrDigest string // Used for logging/progress only
		dgst        digest.Digest
		mt          string
		size        int64
	)
	// Resolve the reference to a digest (plus media type/size when the
	// registry's tag service provides them).
	if digested, isDigested := ref.(reference.Canonical); isDigested {
		dgst = digested.Digest()
		tagOrDigest = digested.String()
	} else if tagged, isTagged := ref.(reference.NamedTagged); isTagged {
		tagService := p.repo.Tags(ctx)
		desc, err := tagService.Get(ctx, tagged.Tag())
		if err != nil {
			return false, err
		}

		dgst = desc.Digest
		tagOrDigest = tagged.Tag()
		mt = desc.MediaType
		size = desc.Size
	} else {
		return false, fmt.Errorf("internal error: reference has neither a tag nor a digest: %s", reference.FamiliarString(ref))
	}

	ctx = log.WithLogger(ctx, log.G(ctx).WithFields(log.Fields{
		"digest": dgst,
		"remote": ref,
	}))

	manifest, err := p.manifestStore.Get(ctx, ocispec.Descriptor{
		MediaType: mt,
		Digest:    dgst,
		Size:      size,
	}, ref)
	if err != nil {
		return false, err
	}
	if manifest == nil {
		return false, fmt.Errorf("image manifest does not exist for tag or digest %q", tagOrDigest)
	}

	// For schema2 manifests, the config media type is validated up front
	// against the puller's allow-list.
	if m, ok := manifest.(*schema2.DeserializedManifest); ok {
		if err := p.validateMediaType(m.Manifest.Config.MediaType); err != nil {
			return false, err
		}
	}

	log.G(ctx).Debugf("Pulling ref from V2 registry: %s", reference.FamiliarString(ref))
	progress.Message(p.config.ProgressOutput, tagOrDigest, "Pulling from "+reference.FamiliarName(p.repo.Named()))

	var (
		id             digest.Digest
		manifestDigest digest.Digest
	)

	// Dispatch on the concrete manifest type. Schema1 manifests are no
	// longer supported and produce a deprecation error.
	switch v := manifest.(type) {
	case *schema2.DeserializedManifest:
		id, manifestDigest, err = p.pullSchema2(ctx, ref, v, platform)
		if err != nil {
			return false, err
		}
	case *ocischema.DeserializedManifest:
		id, manifestDigest, err = p.pullOCI(ctx, ref, v, platform)
		if err != nil {
			return false, err
		}
	case *manifestlist.DeserializedManifestList:
		id, manifestDigest, err = p.pullManifestList(ctx, ref, v, platform)
		if err != nil {
			return false, err
		}
	default:
		mediaType, _, _ := manifest.Payload()
		switch mediaType {
		case MediaTypeDockerSchema1Manifest, MediaTypeDockerSchema1SignedManifest:
			return false, DeprecatedSchema1ImageError(ref)
		}
		return false, invalidManifestFormatError{}
	}

	progress.Message(p.config.ProgressOutput, "", "Digest: "+manifestDigest.String())

	if p.config.ReferenceStore != nil {
		oldTagID, err := p.config.ReferenceStore.Get(ref)
		if err == nil {
			if oldTagID == id {
				// The tag already points at this image; only make sure the
				// digest reference exists, and report "not updated".
				return false, addDigestReference(p.config.ReferenceStore, ref, manifestDigest, id)
			}
		} else if !errors.Is(err, refstore.ErrDoesNotExist) {
			return false, err
		}

		if canonical, ok := ref.(reference.Canonical); ok {
			if err := p.config.ReferenceStore.AddDigest(canonical, id, true); err != nil {
				return false, err
			}
		} else {
			if err := addDigestReference(p.config.ReferenceStore, ref, manifestDigest, id); err != nil {
				return false, err
			}
			if err := p.config.ReferenceStore.AddTag(ref, id, true); err != nil {
				return false, err
			}
		}
	}
	return true, nil
}
// validateMediaType validates if the given mediaType is accepted by the
// puller's configuration, falling back to the default accepted image
// types when no explicit list is configured.
func (p *puller) validateMediaType(mediaType string) error {
	allowed := p.config.Schema2Types
	if len(allowed) == 0 {
		allowed = defaultImageTypes
	}
	for _, t := range allowed {
		if t == mediaType {
			return nil
		}
	}

	configClass := mediaTypeClasses[mediaType]
	if configClass == "" {
		configClass = "unknown"
	}
	return invalidManifestClassError{mediaType, configClass}
}
// checkSupportedMediaType returns an error unless mediaType is one of the
// supported media types (exact match or dotted sub-type). AI model
// artifacts get a dedicated "not supported" error.
func checkSupportedMediaType(mediaType string) error {
	lower := strings.ToLower(mediaType)
	if strings.HasPrefix(lower, "application/vnd.docker.ai.") {
		return AIModelNotSupportedError{}
	}
	for _, mt := range supportedMediaTypes {
		// It should either be an exact match, or have a valid prefix.
		// We append a "." when matching prefixes to exclude "false
		// positives"; for example, we don't want to match
		// "application/vnd.oci.images_are_fun_yolo".
		if lower == mt || strings.HasPrefix(lower, mt+".") {
			return nil
		}
	}
	return unsupportedMediaTypeError{MediaType: mediaType}
}
// pullSchema2Layers downloads the image config and all layers of a
// schema2/OCI manifest concurrently, cross-checks the downloaded rootfs
// DiffIDs against those declared in the config, and registers the config
// in the image store. It returns the resulting image ID. If the image
// already exists locally nothing is downloaded.
func (p *puller) pullSchema2Layers(ctx context.Context, target distribution.Descriptor, layers []distribution.Descriptor, platform *ocispec.Platform) (id digest.Digest, _ error) {
	if _, err := p.config.ImageStore.Get(ctx, target.Digest); err == nil {
		// If the image already exists locally, no need to pull
		// anything.
		return target.Digest, nil
	}

	if err := checkSupportedMediaType(target.MediaType); err != nil {
		return "", err
	}

	var descriptors []xfer.DownloadDescriptor

	// Note that the order of this loop is in the direction of bottom-most
	// to top-most, so that the downloads slice gets ordered correctly.
	for _, d := range layers {
		if err := d.Digest.Validate(); err != nil {
			return "", errors.Wrapf(err, "could not validate layer digest %q", d.Digest)
		}
		if err := checkSupportedMediaType(d.MediaType); err != nil {
			return "", err
		}
		descriptors = append(descriptors, &layerDescriptor{
			digest:          d.Digest,
			repo:            p.repo,
			repoName:        p.repoName,
			metadataService: p.metadataService,
			src:             d,
		})
	}

	// Channels connecting the config-pull goroutine and the layer-download
	// goroutine back to this function. All are buffered/closed so neither
	// goroutine can block forever if this function returns early.
	configChan := make(chan []byte, 1)
	configErrChan := make(chan error, 1)
	layerErrChan := make(chan error, 1)
	downloadsDone := make(chan struct{})
	var cancel func()
	ctx, cancel = context.WithCancel(ctx)
	defer cancel()

	// Pull the image config
	go func() {
		configJSON, err := p.pullSchema2Config(ctx, target.Digest)
		if err != nil {
			configErrChan <- imageConfigPullError{Err: err}
			cancel()
			return
		}
		configChan <- configJSON
	}()

	var (
		configJSON       []byte            // raw serialized image config
		downloadedRootFS *image.RootFS     // rootFS from registered layers
		configRootFS     *image.RootFS     // rootFS from configuration
		release          func()            // release resources from rootFS download
		configPlatform   *ocispec.Platform // for LCOW when registering downloaded layers
	)

	layerStoreOS := runtime.GOOS
	if platform != nil {
		layerStoreOS = platform.OS
	}

	// https://github.com/docker/docker/issues/24766 - Err on the side of caution,
	// explicitly blocking images intended for linux from the Windows daemon. On
	// Windows, we do this before the attempt to download, effectively serialising
	// the download slightly slowing it down. We have to do it this way, as
	// chances are the download of layers itself would fail due to file names
	// which aren't suitable for NTFS. At some point in the future, if a similar
	// check to block Windows images being pulled on Linux is implemented, it
	// may be necessary to perform the same type of serialisation.
	if runtime.GOOS == "windows" {
		var err error
		configJSON, configRootFS, configPlatform, err = receiveConfig(configChan, configErrChan)
		if err != nil {
			return "", err
		}
		if configRootFS == nil {
			return "", errRootFSInvalid
		}
		if err := checkImageCompatibility(configPlatform.OS, configPlatform.OSVersion); err != nil {
			return "", err
		}

		if len(descriptors) != len(configRootFS.DiffIDs) {
			return "", errRootFSMismatch
		}
		if platform == nil {
			// Early bath if the requested OS doesn't match that of the configuration.
			// This avoids doing the download, only to potentially fail later.
			if err := image.CheckOS(configPlatform.OS); err != nil {
				return "", fmt.Errorf("cannot download image with operating system %q when requesting %q", configPlatform.OS, layerStoreOS)
			}
			layerStoreOS = configPlatform.OS
		}

		// Populate diff ids in descriptors to avoid downloading foreign layers
		// which have been side loaded
		for i := range descriptors {
			descriptors[i].(*layerDescriptor).diffID = configRootFS.DiffIDs[i]
		}
	}

	// Assume that the operating system is the host OS if blank, and validate it
	// to ensure we don't cause a panic by an invalid index into the layerstores.
	if layerStoreOS != "" {
		if err := image.CheckOS(layerStoreOS); err != nil {
			return "", err
		}
	}

	if p.config.DownloadManager != nil {
		go func() {
			var (
				err    error
				rootFS image.RootFS
			)
			rootFS, release, err = p.config.DownloadManager.Download(ctx, descriptors, p.config.ProgressOutput)
			if err != nil {
				// Intentionally do not cancel the config download here
				// as the error from config download (if there is one)
				// is more interesting than the layer download error
				layerErrChan <- err
				return
			}

			downloadedRootFS = &rootFS
			close(downloadsDone)
		}()
	} else {
		// We have nothing to download
		close(downloadsDone)
	}

	// On non-Windows hosts configJSON has not been received yet; wait for
	// it now (layers may still be downloading in the background).
	if configJSON == nil {
		var err error
		configJSON, configRootFS, _, err = receiveConfig(configChan, configErrChan)
		if err == nil && configRootFS == nil {
			err = errRootFSInvalid
		}
		if err != nil {
			// Make sure the download goroutine terminates before returning.
			cancel()
			select {
			case <-downloadsDone:
			case <-layerErrChan:
			}
			return "", err
		}
	}

	// Wait for the layer downloads to finish (or fail).
	select {
	case <-downloadsDone:
	case err := <-layerErrChan:
		return "", err
	}

	if release != nil {
		defer release()
	}

	if downloadedRootFS != nil {
		// The DiffIDs returned in rootFS MUST match those in the config.
		// Otherwise the image config could be referencing layers that aren't
		// included in the manifest.
		if len(downloadedRootFS.DiffIDs) != len(configRootFS.DiffIDs) {
			return "", errRootFSMismatch
		}

		for i := range downloadedRootFS.DiffIDs {
			if downloadedRootFS.DiffIDs[i] != configRootFS.DiffIDs[i] {
				return "", errRootFSMismatch
			}
		}
	}

	imageID, err := p.config.ImageStore.Put(ctx, configJSON)
	if err != nil {
		return "", err
	}

	return imageID, nil
}
// pullSchema2 pulls a schema2 manifest: it computes (and, for pull-by-digest,
// verifies) the manifest digest, then downloads the config and layers.
func (p *puller) pullSchema2(ctx context.Context, ref reference.Named, mfst *schema2.DeserializedManifest, platform *ocispec.Platform) (id digest.Digest, manifestDigest digest.Digest, _ error) {
	manifestDigest, err := schema2ManifestDigest(ref, mfst)
	if err != nil {
		return "", "", err
	}
	imageID, err := p.pullSchema2Layers(ctx, mfst.Target(), mfst.Layers, platform)
	if err != nil {
		return "", manifestDigest, err
	}
	return imageID, manifestDigest, nil
}
// pullOCI pulls an OCI image manifest; the flow is identical to schema2:
// digest verification followed by config and layer download.
func (p *puller) pullOCI(ctx context.Context, ref reference.Named, mfst *ocischema.DeserializedManifest, platform *ocispec.Platform) (id digest.Digest, manifestDigest digest.Digest, _ error) {
	manifestDigest, err := schema2ManifestDigest(ref, mfst)
	if err != nil {
		return "", "", err
	}
	imageID, err := p.pullSchema2Layers(ctx, mfst.Target(), mfst.Layers, platform)
	if err != nil {
		return "", manifestDigest, err
	}
	return imageID, manifestDigest, nil
}
// receiveConfig waits for either the raw image config or an error from the
// config-pull goroutine. On success it also decodes the rootfs and
// platform from the config bytes.
func receiveConfig(configChan <-chan []byte, errChan <-chan error) ([]byte, *image.RootFS, *ocispec.Platform, error) {
	// No ctx.Done case is needed in this select, because cancellation
	// triggers an error in p.pullSchema2ImageConfig, which arrives on
	// errChan.
	select {
	case err := <-errChan:
		return nil, nil, nil, err
	case configJSON := <-configChan:
		rootfs, err := rootFSFromConfig(configJSON)
		if err != nil {
			return nil, nil, nil, err
		}
		platform, err := platformFromConfig(configJSON)
		if err != nil {
			return nil, nil, nil, err
		}
		return configJSON, rootfs, platform, nil
	}
}
// pullManifestList handles "manifest lists" which point to various
// platform-specific manifests. It filters the list's entries for the
// requested (or default) platform, and pulls the first matching entry
// that succeeds, dispatching on the entry's manifest type. Nested
// manifest lists are followed recursively.
func (p *puller) pullManifestList(ctx context.Context, ref reference.Named, mfstList *manifestlist.DeserializedManifestList, pp *ocispec.Platform) (id digest.Digest, manifestListDigest digest.Digest, _ error) {
	manifestListDigest, err := schema2ManifestDigest(ref, mfstList)
	if err != nil {
		return "", "", err
	}

	var platform ocispec.Platform
	if pp != nil {
		platform = *pp
	}
	log.G(ctx).Debugf("%s resolved to a manifestList object with %d entries; looking for a %s match", ref, len(mfstList.Manifests), platforms.FormatAll(platform))

	manifestMatches := filterManifests(mfstList.Manifests, platform)

	for _, match := range manifestMatches {
		if err := checkImageCompatibility(match.Platform.OS, match.Platform.OSVersion); err != nil {
			return "", "", err
		}

		manifest, err := p.manifestStore.Get(ctx, ocispec.Descriptor{
			Digest:    match.Digest,
			Size:      match.Size,
			MediaType: match.MediaType,
		}, ref)
		if err != nil {
			return "", "", err
		}

		// Pull the matched entry by digest, keeping the repository name
		// from the original reference.
		manifestRef, err := reference.WithDigest(reference.TrimNamed(ref), match.Digest)
		if err != nil {
			return "", "", err
		}

		switch v := manifest.(type) {
		case *schema2.DeserializedManifest:
			id, _, err = p.pullSchema2(ctx, manifestRef, v, toOCIPlatform(match.Platform))
			if err != nil {
				return "", "", err
			}
		case *ocischema.DeserializedManifest:
			id, _, err = p.pullOCI(ctx, manifestRef, v, toOCIPlatform(match.Platform))
			if err != nil {
				return "", "", err
			}
		case *manifestlist.DeserializedManifestList:
			// Nested manifest list: recurse with the original requested
			// platform.
			id, _, err = p.pullManifestList(ctx, manifestRef, v, pp)
			if err != nil {
				var noMatches noMatchesErr
				if !errors.As(err, &noMatches) {
					// test the next match
					continue
				}
				// A noMatchesErr falls through and is returned below.
			}
		default:
			mediaType, _, _ := manifest.Payload()
			switch mediaType {
			case MediaTypeDockerSchema1Manifest, MediaTypeDockerSchema1SignedManifest:
				return "", "", DeprecatedSchema1ImageError(ref)
			}
			// OCI spec requires to skip unknown manifest types
			continue
		}
		return id, manifestListDigest, err
	}
	return "", "", noMatchesErr{platform: platform}
}
const (
	// defaultSchemaPullBackoff is the initial delay between image-config
	// pull retries; it is doubled after each failed attempt (see retry).
	defaultSchemaPullBackoff = 250 * time.Millisecond
	// defaultMaxSchemaPullAttempts bounds the number of attempts made to
	// fetch the image config blob.
	defaultMaxSchemaPullAttempts = 5
)
// pullSchema2Config fetches the image config blob for dgst from the
// repository's blob store, retrying transient failures, and verifies the
// bytes against the requested digest before returning them.
func (p *puller) pullSchema2Config(ctx context.Context, dgst digest.Digest) ([]byte, error) {
	blobs := p.repo.Blobs(ctx)
	var configJSON []byte
	err := retry(ctx, defaultMaxSchemaPullAttempts, defaultSchemaPullBackoff, func(ctx context.Context) (err error) {
		configJSON, err = blobs.Get(ctx, dgst)
		return err
	})
	if err != nil {
		return nil, err
	}

	// Verify image config digest
	verifier := dgst.Verifier()
	if _, err := verifier.Write(configJSON); err != nil {
		return nil, err
	}
	if !verifier.Verified() {
		err := fmt.Errorf("image config verification failed for digest %s", dgst)
		log.G(ctx).Error(err)
		return nil, err
	}

	return configJSON, nil
}
// noMatchesErr is returned by pullManifestList when none of the manifest
// list's entries matches the requested platform.
type noMatchesErr struct {
	// platform is the platform that was requested; the zero value means
	// no explicit platform was given.
	platform ocispec.Platform
}
// Error describes the platform for which no manifest-list entry matched,
// substituting the default platform when none was requested.
func (e noMatchesErr) Error() string {
	p := e.platform
	if p.OS == "" {
		p = platforms.DefaultSpec()
	}
	return fmt.Sprintf("no matching manifest for %s in the manifest list entries", platforms.FormatAll(p))
}
// retry runs f up to maxAttempts times, sleeping between attempts with
// exponential backoff starting at sleep. It stops early on success, on a
// do-not-retry error, or when ctx is cancelled during the backoff wait
// (returning ctx.Err()). On failure, the returned error wraps the last
// error from f with the number of attempts actually made.
//
// Bug fixed: the original reported "attempts=maxAttempts+1" when every
// attempt failed, because the loop counter had already been incremented
// past the final iteration.
func retry(ctx context.Context, maxAttempts int, sleep time.Duration, f func(ctx context.Context) error) error {
	var err error
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		err = retryOnError(f(ctx))
		if err == nil {
			return nil
		}
		if xfer.IsDoNotRetryError(err) {
			// The error is terminal; report how many attempts were made.
			return errors.Wrapf(err, "download failed after attempts=%d", attempt)
		}
		if attempt < maxAttempts {
			timer := time.NewTimer(sleep)
			select {
			case <-ctx.Done():
				timer.Stop()
				return ctx.Err()
			case <-timer.C:
				log.G(ctx).WithError(err).WithField("attempts", attempt).Debug("retrying after error")
				sleep *= 2 // exponential backoff
			}
		}
	}
	return errors.Wrapf(err, "download failed after attempts=%d", maxAttempts)
}
// schema2ManifestDigest computes the manifest digest, and, if pulling by
// digest, ensures that it matches the requested digest.
func schema2ManifestDigest(ref reference.Named, mfst distribution.Manifest) (digest.Digest, error) {
	_, canonical, err := mfst.Payload()
	if err != nil {
		return "", err
	}

	digested, isDigested := ref.(reference.Canonical)
	if !isDigested {
		// Pulling by tag: the manifest digest is simply the digest of
		// the canonical payload.
		return digest.FromBytes(canonical), nil
	}

	// If pull by digest, then verify the manifest digest.
	verifier := digested.Digest().Verifier()
	if _, err := verifier.Write(canonical); err != nil {
		return "", err
	}
	if !verifier.Verified() {
		err := fmt.Errorf("manifest verification failed for digest %s", digested.Digest())
		log.G(context.TODO()).Error(err)
		return "", err
	}
	return digested.Digest(), nil
}
// createDownloadFile creates a temporary file, in the default directory
// for temporary files, to hold an in-progress layer download.
func createDownloadFile() (*os.File, error) {
	f, err := os.CreateTemp("", "GetImageBlob")
	if err != nil {
		return nil, err
	}
	return f, nil
}
// toOCIPlatform converts a distribution manifest-list platform spec into
// an OCI platform. The distribution pkg defines platform as a non-pointer
// value, so a fully empty spec is mapped to nil here. This is temporary
// until the correct OCI image-spec package is used.
func toOCIPlatform(p manifestlist.PlatformSpec) *ocispec.Platform {
	isEmpty := p.OS == "" && p.Architecture == "" && p.Variant == "" &&
		p.OSVersion == "" && p.OSFeatures == nil && p.Features == nil
	if isEmpty {
		return nil
	}
	return &ocispec.Platform{
		OS:           p.OS,
		Architecture: p.Architecture,
		Variant:      p.Variant,
		OSFeatures:   p.OSFeatures,
		OSVersion:    p.OSVersion,
	}
}
// maximumSpec returns the distribution platform with maximum compatibility
// for the current node. On amd64 the variant is populated from
// archvariant.AMD64Variant().
func maximumSpec() ocispec.Platform {
	p := platforms.DefaultSpec()
	if p.Architecture != "amd64" {
		return p
	}
	p.Variant = archvariant.AMD64Variant()
	return p
}
//go:build !windows
package distribution
import (
"context"
"sort"
"github.com/containerd/log"
"github.com/containerd/platforms"
"github.com/docker/distribution"
"github.com/docker/distribution/manifest/manifestlist"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// open starts a download of the layer blob from the repository's blob
// store. The returned stream is seekable, which Download uses to resume
// partial downloads and to discover the blob size.
func (ld *layerDescriptor) open(ctx context.Context) (distribution.ReadSeekCloser, error) {
	blobs := ld.repo.Blobs(ctx)
	return blobs.Open(ctx, ld.digest)
}
// filterManifests returns the manifest-list entries compatible with
// platform p (defaulted and normalized first), sorted with the most
// preferred match first. Entries with no platform information at all are
// kept (they cannot be ruled out) but sort after entries with a platform.
func filterManifests(manifests []manifestlist.ManifestDescriptor, p ocispec.Platform) []manifestlist.ManifestDescriptor {
	p = platforms.Normalize(withDefault(p))
	m := platforms.Only(p)
	var matches []manifestlist.ManifestDescriptor
	for _, desc := range manifests {
		descP := toOCIPlatform(desc.Platform)
		// nil platform (no platform info in the entry) is accepted.
		if descP == nil || m.Match(*descP) {
			matches = append(matches, desc)
			if descP != nil {
				log.G(context.TODO()).Debugf("found match for %s with media type %s, digest %s", platforms.FormatAll(p), desc.MediaType, desc.Digest.String())
			}
		}
	}

	// Stable-sort by the matcher's preference; platform-less entries are
	// pushed to the end.
	sort.SliceStable(matches, func(i, j int) bool {
		p1 := toOCIPlatform(matches[i].Platform)
		if p1 == nil {
			return false
		}
		p2 := toOCIPlatform(matches[j].Platform)
		if p2 == nil {
			return true
		}
		return m.Less(*p1, *p2)
	})

	return matches
}
// checkImageCompatibility is a Windows-specific function. No-op on Linux.
// (This is the !windows build of the function; the Windows build performs
// the real OS/OS-version compatibility check.)
func checkImageCompatibility(imageOS, imageOSVersion string) error {
	return nil
}
// withDefault fills the empty OS and architecture fields of p from the
// node's maximum-compatibility platform. When the architecture is
// defaulted, the variant is defaulted along with it.
func withDefault(p ocispec.Platform) ocispec.Platform {
	def := maximumSpec()
	if p.OS == "" {
		p.OS = def.OS
	}
	if p.Architecture == "" {
		p.Architecture, p.Variant = def.Architecture, def.Variant
	}
	return p
}
package distribution
import (
"bufio"
"compress/gzip"
"context"
"errors"
"fmt"
"io"
"github.com/containerd/log"
"github.com/distribution/reference"
"github.com/docker/docker/pkg/progress"
"github.com/moby/moby/api/types/events"
)
// compressionBufSize is the buffer size used when compressing a layer
// upload; compress() wraps the pipe writer in a bufio.Writer of this size
// to avoid excessive chunking in the HTTP request.
const compressionBufSize = 32768
// Push initiates a push operation on ref. ref is the specific variant of the
// image to push. If no tag is provided, all tags are pushed.
//
// Candidate endpoints are tried in order; a fallbackError from one endpoint
// moves on to the next, while any other error (including cancellation)
// aborts the push. Endpoints observed to speak TLS are remembered so that
// plaintext retries against the same host are skipped.
func Push(ctx context.Context, ref reference.Named, config *ImagePushConfig) error {
	// FIXME: Allow to interrupt current push when new push of same image is done.

	// Resolve the Repository name from fqn to RepositoryInfo
	repoName := reference.TrimNamed(ref)

	endpoints, err := config.RegistryService.LookupPushEndpoints(reference.Domain(repoName))
	if err != nil {
		return err
	}

	progress.Messagef(config.ProgressOutput, "", "The push refers to repository [%s]", repoName.Name())

	associations := config.ReferenceStore.ReferencesByName(repoName)
	if len(associations) == 0 {
		return fmt.Errorf("An image does not exist locally with the tag: %s", reference.FamiliarName(repoName))
	}

	var (
		lastErr error

		// confirmedTLSRegistries is a map indicating which registries
		// are known to be using TLS. There should never be a plaintext
		// retry for any of these.
		confirmedTLSRegistries = make(map[string]struct{})
	)

	for _, endpoint := range endpoints {
		if endpoint.URL.Scheme != "https" {
			if _, confirmedTLS := confirmedTLSRegistries[endpoint.URL.Host]; confirmedTLS {
				log.G(ctx).Debugf("Skipping non-TLS endpoint %s for host/port that appears to use TLS", endpoint.URL)
				continue
			}
		}

		log.G(ctx).Debugf("Trying to push %s to %s", repoName.Name(), endpoint.URL)

		if err := newPusher(ref, endpoint, repoName, config).push(ctx); err != nil {
			// Was this push cancelled? If so, don't try to fall
			// back.
			select {
			case <-ctx.Done():
			default:
				if fallbackErr, ok := err.(fallbackError); ok {
					// Record that this host speaks TLS so plaintext
					// endpoints for it are skipped on later iterations.
					if fallbackErr.transportOK && endpoint.URL.Scheme == "https" {
						confirmedTLSRegistries[endpoint.URL.Host] = struct{}{}
					}
					err = fallbackErr.err
					lastErr = err
					log.G(ctx).Infof("Attempting next endpoint for push after error: %v", err)
					continue
				}
			}

			// FIXME(thaJeztah): cleanup error and context handling in this package, as it's really messy.
			if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
				log.G(ctx).WithError(err).Info("Not continuing with push after error")
			} else {
				log.G(ctx).WithError(err).Error("Not continuing with push after error")
			}

			return err
		}

		// Push succeeded: emit the push event and stop trying endpoints.
		config.ImageEventLogger(ctx, reference.FamiliarString(ref), reference.FamiliarName(repoName), events.ActionPush)
		return nil
	}

	if lastErr == nil {
		lastErr = fmt.Errorf("no endpoints found for %s", repoName.Name())
	}
	return lastErr
}
// compress returns an io.ReadCloser which will supply a compressed version
// of the provided Reader. The caller must close the ReadCloser after
// reading the compressed data.
//
// Note that this function returns a reader instead of taking a writer as
// an argument so that it can be used with httpBlobWriter's ReadFrom
// method. Using httpBlobWriter's Write method would send a PATCH request
// for every Write call.
//
// The second return value is a channel that gets closed when the goroutine
// is finished. This allows the caller to make sure the goroutine finishes
// before it releases any resources connected with the reader that was
// passed in.
func compress(in io.Reader) (io.ReadCloser, chan struct{}) {
	done := make(chan struct{})
	pr, pw := io.Pipe()
	// Use a bufio.Writer to avoid excessive chunking in HTTP request.
	buffered := bufio.NewWriterSize(pw, compressionBufSize)
	gz := gzip.NewWriter(buffered)

	go func() {
		// Closing done last signals that all writes to the pipe have
		// finished.
		defer close(done)

		_, err := io.Copy(gz, in)
		if err == nil {
			err = gz.Close()
		}
		if err == nil {
			err = buffered.Flush()
		}
		if err != nil {
			pw.CloseWithError(err)
			return
		}
		pw.Close()
	}()

	return pr, done
}
package distribution
import (
"context"
"fmt"
"io"
"sort"
"strings"
"sync"
"github.com/containerd/log"
"github.com/distribution/reference"
"github.com/docker/distribution"
"github.com/docker/distribution/manifest/schema2"
"github.com/docker/distribution/registry/api/errcode"
"github.com/docker/distribution/registry/client"
"github.com/docker/docker/distribution/metadata"
"github.com/docker/docker/distribution/xfer"
"github.com/docker/docker/layer"
"github.com/docker/docker/pkg/ioutils"
"github.com/docker/docker/pkg/progress"
"github.com/docker/docker/pkg/stringid"
"github.com/docker/docker/registry"
apitypes "github.com/moby/moby/api/types"
"github.com/opencontainers/go-digest"
"github.com/pkg/errors"
)
const (
	// smallLayerMaximumSize is the maximum size for a layer to be treated
	// as "small". NOTE(review): the thresholds are consumed outside this
	// chunk — verify their exact use at the call sites.
	smallLayerMaximumSize = 100 * (1 << 10) // 100KB
	// middleLayerMaximumSize is the maximum size for a layer to be treated
	// as "middle"-sized.
	middleLayerMaximumSize = 10 * (1 << 20) // 10MB
)
// newPusher creates a new pusher for pushing to a v2 registry.
// The parameters are passed through to the underlying pusher
// implementation for use during the actual push operation.
func newPusher(ref reference.Named, endpoint registry.APIEndpoint, repoName reference.Named, config *ImagePushConfig) *pusher {
	p := &pusher{
		metadataService: metadata.NewV2MetadataService(config.MetadataStore),
		ref:             ref,
		endpoint:        endpoint,
		repoName:        repoName,
		config:          config,
	}
	return p
}
// pusher pushes an image (one tag or all tags) to a single v2 registry
// endpoint. One pusher is created per endpoint attempt.
type pusher struct {
	metadataService metadata.V2MetadataService
	ref             reference.Named
	endpoint        registry.APIEndpoint
	repoName        reference.Named
	config          *ImagePushConfig
	// repo is the repository client; it is set by push() once the
	// connection to the registry has been established.
	repo distribution.Repository

	// pushState is state built by the Upload functions.
	pushState pushState
}
// pushState is the mutable state shared by the concurrent layer uploads
// of a push; the embedded mutex guards both fields.
type pushState struct {
	sync.Mutex
	// remoteLayers is the set of layers known to exist on the remote side.
	// This avoids redundant queries when pushing multiple tags that
	// involve the same layers. It is also used to fill in digest and size
	// information when building the manifest.
	remoteLayers map[layer.DiffID]distribution.Descriptor
	// hasAuthInfo records whether credentials (token or user/password)
	// were supplied for this push.
	hasAuthInfo bool
}
// TODO(tiborvass): have push() take a reference to repository + tag, so that the pusher itself is repository-agnostic.
// push connects to the registry and pushes the repository. Errors for
// which continueOnError allows a fallback are wrapped in fallbackError so
// the caller (Push) can try the next endpoint.
func (p *pusher) push(ctx context.Context) (err error) {
	p.pushState.remoteLayers = make(map[layer.DiffID]distribution.Descriptor)

	// hasAuthInfo is derived from the config regardless of whether the
	// repository connection succeeded; the error is checked just after.
	p.repo, err = newRepository(ctx, p.repoName, p.endpoint, p.config.MetaHeaders, p.config.AuthConfig, "push", "pull")
	p.pushState.hasAuthInfo = p.config.AuthConfig.RegistryToken != "" || (p.config.AuthConfig.Username != "" && p.config.AuthConfig.Password != "")
	if err != nil {
		log.G(ctx).Debugf("Error getting v2 registry: %v", err)
		return err
	}

	if err = p.pushRepository(ctx); err != nil {
		// [Service.LookupPushEndpoints] never returns mirror endpoint.
		if continueOnError(err, false) {
			return fallbackError{
				err:         err,
				transportOK: true,
			}
		}
	}
	return err
}
// pushRepository pushes the single tag referenced by p.ref, or every tag
// of the repository when p.ref is name-only. Digest references cannot be
// pushed.
func (p *pusher) pushRepository(ctx context.Context) error {
	if namedTagged, ok := p.ref.(reference.NamedTagged); ok {
		imageID, err := p.config.ReferenceStore.Get(p.ref)
		if err != nil {
			return fmt.Errorf("tag does not exist: %s", reference.FamiliarString(p.ref))
		}
		return p.pushTag(ctx, namedTagged, imageID)
	}

	if !reference.IsNameOnly(p.ref) {
		return errors.New("cannot push a digest reference")
	}

	// Push all tags
	pushed := 0
	for _, association := range p.config.ReferenceStore.ReferencesByName(p.ref) {
		namedTagged, ok := association.Ref.(reference.NamedTagged)
		if !ok {
			continue
		}
		pushed++
		if err := p.pushTag(ctx, namedTagged, association.ID); err != nil {
			return err
		}
	}

	if pushed == 0 {
		return fmt.Errorf("no tags to push for %s", reference.FamiliarName(p.repoName))
	}
	return nil
}
// pushTag pushes a single tagged image: it uploads all layers of the
// image (bottom-to-top rootfs walk), builds and pushes a schema2
// manifest, records the resulting digest reference locally, and reports
// the digest via the progress output (which also signals the trust
// client).
func (p *pusher) pushTag(ctx context.Context, ref reference.NamedTagged, id digest.Digest) error {
	log.G(ctx).Debugf("Pushing repository: %s", reference.FamiliarString(ref))

	imgConfig, err := p.config.ImageStore.Get(ctx, id)
	if err != nil {
		return fmt.Errorf("could not find image from tag %s: %v", reference.FamiliarString(ref), err)
	}

	rootfs, err := rootFSFromConfig(imgConfig)
	if err != nil {
		return fmt.Errorf("unable to get rootfs for image %s: %s", reference.FamiliarString(ref), err)
	}

	l, err := p.config.LayerStores.Get(rootfs.ChainID())
	if err != nil {
		return fmt.Errorf("failed to get top layer from image: %v", err)
	}
	defer l.Release()

	hmacKey, err := metadata.ComputeV2MetadataHMACKey(p.config.AuthConfig)
	if err != nil {
		return fmt.Errorf("failed to compute hmac key of auth config: %v", err)
	}

	var descriptors []xfer.UploadDescriptor

	descriptorTemplate := pushDescriptor{
		metadataService: p.metadataService,
		hmacKey:         hmacKey,
		repoName:        p.repoName,
		ref:             p.ref,
		repo:            p.repo,
		pushState:       &p.pushState,
	}

	// Loop bounds condition is to avoid pushing the base layer on Windows.
	// Walk from the top layer down via Parent(), creating one upload
	// descriptor per layer; descriptors therefore ends up in top-to-bottom
	// (reverse) order — see manifestFromBuilder.
	for range rootfs.DiffIDs {
		descriptor := descriptorTemplate
		descriptor.layer = l
		descriptor.checkedDigests = make(map[digest.Digest]struct{})
		descriptors = append(descriptors, &descriptor)

		l = l.Parent()
	}

	if err := p.config.UploadManager.Upload(ctx, descriptors, p.config.ProgressOutput); err != nil {
		return err
	}

	// Try schema2 first
	builder := schema2.NewManifestBuilder(p.repo.Blobs(ctx), p.config.ConfigMediaType, imgConfig)
	manifest, err := manifestFromBuilder(ctx, builder, descriptors)
	if err != nil {
		return err
	}

	manSvc, err := p.repo.Manifests(ctx)
	if err != nil {
		return err
	}

	putOptions := []distribution.ManifestServiceOption{distribution.WithTag(ref.Tag())}
	if _, err = manSvc.Put(ctx, manifest, putOptions...); err != nil {
		// NOTE(review): string comparison on the registry error is
		// fragile but intentional here — it detects registries that only
		// accept the removed schema1 format.
		if err.Error() == "tag invalid" {
			msg := "[DEPRECATED] support for pushing manifest v2 schema1 images has been removed. More information at https://docs.docker.com/registry/spec/deprecated-schema-v1/"
			log.G(ctx).WithError(err).Error(msg)
			err = errors.Wrap(err, msg)
		}
		return err
	}

	var canonicalManifest []byte

	switch v := manifest.(type) {
	case *schema2.DeserializedManifest:
		_, canonicalManifest, err = v.Payload()
		if err != nil {
			return err
		}
	default:
		return fmt.Errorf("unknown manifest type %T", v)
	}

	manifestDigest := digest.FromBytes(canonicalManifest)
	progress.Messagef(p.config.ProgressOutput, "", "%s: digest: %s size: %d", ref.Tag(), manifestDigest, len(canonicalManifest))

	if err := addDigestReference(p.config.ReferenceStore, ref, manifestDigest, id); err != nil {
		return err
	}

	// Signal digest to the trust client so it can sign the
	// push, if appropriate.
	progress.Aux(p.config.ProgressOutput, apitypes.PushResult{Tag: ref.Tag(), Digest: manifestDigest.String(), Size: len(canonicalManifest)})

	return nil
}
// manifestFromBuilder appends every pushed layer reference to the builder and
// returns the built manifest. descriptors arrive ordered top layer first, so
// they are walked from the end to append base-to-top.
func manifestFromBuilder(ctx context.Context, builder distribution.ManifestBuilder, descriptors []xfer.UploadDescriptor) (distribution.Manifest, error) {
	for i := range descriptors {
		d := descriptors[len(descriptors)-1-i].(*pushDescriptor)
		if err := builder.AppendReference(d); err != nil {
			return nil, err
		}
	}
	return builder.Build(ctx)
}
// pushDescriptor describes a single layer to be pushed; it is handed to the
// upload manager as an xfer.UploadDescriptor.
type pushDescriptor struct {
	layer           PushLayer
	metadataService metadata.V2MetadataService
	// hmacKey tags/verifies cached V2 metadata entries (derived from the auth config).
	hmacKey []byte
	// repoName is the fully-qualified target repository name.
	repoName reference.Named
	ref      reference.Named
	repo     distribution.Repository
	// pushState is shared between all layers of one push operation.
	pushState *pushState
	// remoteDescriptor is set via SetRemoteDescriptor and returned by Descriptor.
	remoteDescriptor distribution.Descriptor
	// a set of digests whose presence has been checked in a target repository
	checkedDigests map[digest.Digest]struct{}
}
// Key returns the deduplication key for this upload: the target reference
// name combined with the layer's DiffID.
func (pd *pushDescriptor) Key() string {
	return fmt.Sprintf("v2push:%s %s", pd.ref.Name(), pd.layer.DiffID())
}
// ID returns a short identifier for progress display, a truncated form of
// the layer's DiffID.
func (pd *pushDescriptor) ID() string {
	return stringid.TruncateID(pd.layer.DiffID().String())
}
// DiffID returns the DiffID of the layer being pushed.
func (pd *pushDescriptor) DiffID() layer.DiffID {
	return pd.layer.DiffID()
}
// Upload pushes this layer's blob to the target repository. It tries, in
// order: (1) the in-memory push state, (2) an existence check against digests
// cached for the target repository, (3) cross-repository mounts from other
// repositories on the same registry, (4) additional existence checks with any
// remaining cached digests, and finally (5) a full blob upload.
func (pd *pushDescriptor) Upload(ctx context.Context, progressOutput progress.Output) (distribution.Descriptor, error) {
	diffID := pd.DiffID()

	pd.pushState.Lock()
	if descriptor, ok := pd.pushState.remoteLayers[diffID]; ok {
		// it is already known that the push is not needed and
		// therefore doing a stat is unnecessary
		pd.pushState.Unlock()
		progress.Update(progressOutput, pd.ID(), "Layer already exists")
		return descriptor, nil
	}
	pd.pushState.Unlock()

	// Budget for mounts/checks scales with layer size; small layers are
	// cheaper to just upload.
	maxMountAttempts, maxExistenceChecks, checkOtherRepositories := getMaxMountAndExistenceCheckAttempts(pd.layer)

	// Do we have any metadata associated with this layer's DiffID?
	metaData, err := pd.metadataService.GetMetadata(diffID)
	if err == nil {
		// check for blob existence in the target repository
		descriptor, exists, err := pd.layerAlreadyExists(ctx, progressOutput, diffID, true, 1, metaData)
		if exists || err != nil {
			return descriptor, err
		}
	}

	// if digest was empty or not saved, or if blob does not exist on the remote repository,
	// then push the blob.
	bs := pd.repo.Blobs(ctx)

	var layerUpload distribution.BlobWriter

	// Attempt to find another repository in the same registry to mount the layer from to avoid an unnecessary upload
	candidates := getRepositoryMountCandidates(pd.repoName, pd.hmacKey, maxMountAttempts, metaData)
	isUnauthorizedError := false
	for _, mc := range candidates {
		mountCandidate := mc
		log.G(ctx).Debugf("attempting to mount layer %s (%s) from %s", diffID, mountCandidate.Digest, mountCandidate.SourceRepository)
		createOpts := []distribution.BlobCreateOption{}

		if mountCandidate.SourceRepository != "" {
			namedRef, err := reference.ParseNormalizedNamed(mountCandidate.SourceRepository)
			if err != nil {
				// namedRef is nil when parsing fails, so log the raw source
				// string instead of formatting the (nil) reference.
				log.G(ctx).WithError(err).Errorf("failed to parse source repository reference %v", mountCandidate.SourceRepository)
				_ = pd.metadataService.Remove(mountCandidate)
				continue
			}

			// Candidates are always under same domain, create remote reference
			// with only path to set mount from with
			remoteRef, err := reference.WithName(reference.Path(namedRef))
			if err != nil {
				log.G(ctx).WithError(err).Errorf("failed to make remote reference out of %q", reference.Path(namedRef))
				continue
			}

			canonicalRef, err := reference.WithDigest(reference.TrimNamed(remoteRef), mountCandidate.Digest)
			if err != nil {
				log.G(ctx).WithError(err).Error("failed to make canonical reference")
				continue
			}

			createOpts = append(createOpts, client.WithMountFrom(canonicalRef))
		}

		// send the layer
		lu, err := bs.Create(ctx, createOpts...)
		switch err := err.(type) {
		case nil:
			// noop
		case distribution.ErrBlobMounted:
			// The registry mounted the blob from the other repository;
			// record it as present and cache the DiffID→digest mapping.
			progress.Updatef(progressOutput, pd.ID(), "Mounted from %s", err.From.Name())

			err.Descriptor.MediaType = schema2.MediaTypeLayer

			pd.pushState.Lock()
			pd.pushState.remoteLayers[diffID] = err.Descriptor
			pd.pushState.Unlock()

			// Cache mapping from this layer's DiffID to the blobsum
			if err := pd.metadataService.TagAndAdd(diffID, pd.hmacKey, metadata.V2Metadata{
				Digest:           err.Descriptor.Digest,
				SourceRepository: pd.repoName.Name(),
			}); err != nil {
				return distribution.Descriptor{}, xfer.DoNotRetry{Err: err}
			}
			return err.Descriptor, nil
		case errcode.Errors:
			for _, e := range err {
				switch e := e.(type) {
				case errcode.Error:
					if e.Code == errcode.ErrorCodeUnauthorized {
						// when unauthorized error that indicate user don't has right to push layer to register
						log.G(ctx).Debugln("failed to push layer to registry because unauthorized error")
						isUnauthorizedError = true
					}
				default:
				}
			}
		default:
			log.G(ctx).Infof("failed to mount layer %s (%s) from %s: %v", diffID, mountCandidate.Digest, mountCandidate.SourceRepository, err)
		}

		// when error is unauthorizedError and user don't hasAuthInfo that's the case user don't has right to push layer to register
		// and he hasn't login either, in this case candidate cache should be removed
		if mountCandidate.SourceRepository != "" &&
			(!isUnauthorizedError || pd.pushState.hasAuthInfo) &&
			(metadata.CheckV2MetadataHMAC(&mountCandidate, pd.hmacKey) ||
				mountCandidate.HMAC == "") {
			cause := "blob mount failure"
			if err != nil {
				cause = fmt.Sprintf("an error: %v", err.Error())
			}
			log.G(ctx).Debugf("removing association between layer %s and %s due to %s", mountCandidate.Digest, mountCandidate.SourceRepository, cause)
			_ = pd.metadataService.Remove(mountCandidate)
		}

		if lu != nil {
			// cancel previous upload
			cancelLayerUpload(ctx, mountCandidate.Digest, layerUpload)
			layerUpload = lu
		}
	}

	if maxExistenceChecks-len(pd.checkedDigests) > 0 {
		// do additional layer existence checks with other known digests if any
		descriptor, exists, err := pd.layerAlreadyExists(ctx, progressOutput, diffID, checkOtherRepositories, maxExistenceChecks-len(pd.checkedDigests), metaData)
		if exists || err != nil {
			return descriptor, err
		}
	}

	log.G(ctx).Debugf("Pushing layer: %s", diffID)
	if layerUpload == nil {
		layerUpload, err = bs.Create(ctx)
		if err != nil {
			return distribution.Descriptor{}, retryOnError(err)
		}
	}
	defer layerUpload.Close()

	// upload the blob
	return pd.uploadUsingSession(ctx, progressOutput, diffID, layerUpload)
}
// SetRemoteDescriptor records a descriptor already known to exist remotely;
// it is later retrieved via Descriptor.
func (pd *pushDescriptor) SetRemoteDescriptor(descriptor distribution.Descriptor) {
	pd.remoteDescriptor = descriptor
}
// Descriptor returns the remote descriptor previously stored with
// SetRemoteDescriptor (zero value if none was set).
func (pd *pushDescriptor) Descriptor() distribution.Descriptor {
	return pd.remoteDescriptor
}
// uploadUsingSession streams the layer's tar data into layerUpload, digesting
// it on the fly, commits the blob under the computed digest, and records the
// result both in the metadata cache and in the shared push state.
func (pd *pushDescriptor) uploadUsingSession(
	ctx context.Context,
	progressOutput progress.Output,
	diffID layer.DiffID,
	layerUpload distribution.BlobWriter,
) (distribution.Descriptor, error) {
	var reader io.ReadCloser

	contentReader, err := pd.layer.Open()
	if err != nil {
		return distribution.Descriptor{}, retryOnError(err)
	}

	// Wrap with cancellation and progress reporting.
	reader = progress.NewProgressReader(ioutils.NewCancelReadCloser(ctx, contentReader), progressOutput, pd.layer.Size(), pd.ID(), "Pushing")

	switch m := pd.layer.MediaType(); m {
	case schema2.MediaTypeUncompressedLayer:
		// Compress on the fly; the deferred closure closes the *source*
		// reader and waits for the compression goroutine to finish.
		compressedReader, compressionDone := compress(reader)
		defer func(closer io.Closer) {
			closer.Close()
			<-compressionDone
		}(reader)
		reader = compressedReader
	case schema2.MediaTypeLayer:
		// already compressed; push as-is
	default:
		reader.Close()
		return distribution.Descriptor{}, xfer.DoNotRetry{Err: fmt.Errorf("unsupported layer media type %s", m)}
	}

	// Tee the stream through a digester so the pushed digest is computed
	// from exactly the bytes written to the registry.
	digester := digest.Canonical.Digester()
	tee := io.TeeReader(reader, digester.Hash())

	nn, err := layerUpload.ReadFrom(tee)
	reader.Close()
	if err != nil {
		return distribution.Descriptor{}, retryOnError(err)
	}

	pushDigest := digester.Digest()
	if _, err := layerUpload.Commit(ctx, distribution.Descriptor{Digest: pushDigest}); err != nil {
		return distribution.Descriptor{}, retryOnError(err)
	}

	log.G(ctx).Debugf("uploaded layer %s (%s), %d bytes", diffID, pushDigest, nn)
	progress.Update(progressOutput, pd.ID(), "Pushed")

	// Cache mapping from this layer's DiffID to the blobsum
	if err := pd.metadataService.TagAndAdd(diffID, pd.hmacKey, metadata.V2Metadata{
		Digest:           pushDigest,
		SourceRepository: pd.repoName.Name(),
	}); err != nil {
		return distribution.Descriptor{}, xfer.DoNotRetry{Err: err}
	}

	desc := distribution.Descriptor{
		Digest:    pushDigest,
		MediaType: schema2.MediaTypeLayer,
		Size:      nn,
	}

	// Mark the layer as present remotely so sibling descriptors skip it.
	pd.pushState.Lock()
	pd.pushState.remoteLayers[diffID] = desc
	pd.pushState.Unlock()

	return desc, nil
}
// layerAlreadyExists checks if the registry already knows about any of the metadata passed in the "metadata"
// slice. If it finds one that the registry knows about, it returns the known digest and "true". If
// "checkOtherRepositories" is true, stat will be performed also with digests mapped to any other repository
// (not just the target one). At most maxExistenceCheckAttempts digests are
// stat-ed; every digest checked is recorded in pd.checkedDigests so later
// calls do not repeat the work.
func (pd *pushDescriptor) layerAlreadyExists(
	ctx context.Context,
	progressOutput progress.Output,
	diffID layer.DiffID,
	checkOtherRepositories bool,
	maxExistenceCheckAttempts int,
	v2Metadata []metadata.V2Metadata,
) (_ distribution.Descriptor, exists bool, _ error) {
	// filter the metadata: unless checkOtherRepositories is set, keep only
	// entries mapped to the target repository (or with no source at all)
	candidates := []metadata.V2Metadata{}
	for _, meta := range v2Metadata {
		if meta.SourceRepository != "" && !checkOtherRepositories && meta.SourceRepository != pd.repoName.Name() {
			continue
		}
		candidates = append(candidates, meta)
	}
	// sort the candidates by similarity
	sortV2MetadataByLikenessAndAge(pd.repoName, pd.hmacKey, candidates)

	digestToMetadata := make(map[digest.Digest]*metadata.V2Metadata)
	// an array of unique blob digests ordered from the best mount candidates to worst
	layerDigests := []digest.Digest{}
	for i := 0; i < len(candidates); i++ {
		if len(layerDigests) >= maxExistenceCheckAttempts {
			break
		}
		meta := &candidates[i]
		if _, ok := digestToMetadata[meta.Digest]; ok {
			// keep reference just to the first mapping (the best mount candidate)
			continue
		}
		if _, ok := pd.checkedDigests[meta.Digest]; ok {
			// existence of this digest has already been tested
			continue
		}
		digestToMetadata[meta.Digest] = meta
		layerDigests = append(layerDigests, meta.Digest)
	}

	var desc distribution.Descriptor

attempts:
	for _, dgst := range layerDigests {
		meta := digestToMetadata[dgst]
		log.G(ctx).Debugf("Checking for presence of layer %s (%s) in %s", diffID, dgst, pd.repoName.Name())
		var err error
		desc, err = pd.repo.Blobs(ctx).Stat(ctx, dgst)
		pd.checkedDigests[meta.Digest] = struct{}{}
		switch {
		case err == nil:
			// The blob exists. Refresh the metadata cache unless it already
			// holds a valid, HMAC-verified mapping to the target repository.
			if m, ok := digestToMetadata[desc.Digest]; !ok || m.SourceRepository != pd.repoName.Name() || !metadata.CheckV2MetadataHMAC(m, pd.hmacKey) {
				// cache mapping from this layer's DiffID to the blobsum
				if err := pd.metadataService.TagAndAdd(diffID, pd.hmacKey, metadata.V2Metadata{
					Digest:           desc.Digest,
					SourceRepository: pd.repoName.Name(),
				}); err != nil {
					return distribution.Descriptor{}, false, xfer.DoNotRetry{Err: err}
				}
			}
			desc.MediaType = schema2.MediaTypeLayer
			exists = true
			break attempts
		case errors.Is(err, distribution.ErrBlobUnknown):
			if meta.SourceRepository == pd.repoName.Name() {
				// remove the mapping to the target repository
				if err := pd.metadataService.Remove(*meta); err != nil {
					log.G(ctx).WithError(err).Debug("Failed remove metadata")
				}
			}
		default:
			// transient or unexpected error; try the next candidate digest
			log.G(ctx).WithError(err).Debugf("Failed to check for presence of layer %s (%s) in %s", diffID, dgst, pd.repoName.Name())
		}
	}

	if exists {
		progress.Update(progressOutput, pd.ID(), "Layer already exists")
		pd.pushState.Lock()
		pd.pushState.remoteLayers[diffID] = desc
		pd.pushState.Unlock()
	}

	return desc, exists, nil
}
// getMaxMountAndExistenceCheckAttempts returns a maximum number of cross repository mount attempts from
// source repositories of target registry, maximum number of layer existence checks performed on the target
// repository and whether the check shall be done also with digests mapped to different repositories. The
// decision is based on layer size. The smaller the layer, the fewer attempts shall be made because the cost
// of upload does not outweigh a latency.
func getMaxMountAndExistenceCheckAttempts(layer PushLayer) (maxMountAttempts, maxExistenceCheckAttempts int, checkOtherRepositories bool) {
	size := layer.Size()
	if size > middleLayerMaximumSize {
		// Big blob: several mount attempts, then a few existence checks
		// against digests mapped to any repository, then upload.
		return 4, 3, true
	}
	if size > smallLayerMaximumSize {
		// Middle sized blob (also the bucket used when the size is unknown):
		// a few mount attempts, at most one existence check against an
		// existing mapping to the target repository, then upload.
		return 3, 1, false
	}
	// Small blob: uploading is cheap, so do a minimum number of checks.
	return 1, 1, false
}
// getRepositoryMountCandidates returns an array of v2 metadata items belonging to the given registry. The
// array is sorted from youngest to oldest. The resulting array will contain only metadata entries having
// registry part of SourceRepository matching the part of repoInfo.
func getRepositoryMountCandidates(
	repoInfo reference.Named,
	hmacKey []byte,
	maxCandidates int,
	v2Metadata []metadata.V2Metadata,
) []metadata.V2Metadata {
	candidates := make([]metadata.V2Metadata, 0, len(v2Metadata))
	for _, meta := range v2Metadata {
		sourceRepo, err := reference.ParseNamed(meta.SourceRepository)
		if err != nil {
			continue
		}
		// only repositories on the same registry can serve as mount sources
		if reference.Domain(repoInfo) != reference.Domain(sourceRepo) {
			continue
		}
		// target repository is not a viable candidate
		if meta.SourceRepository == repoInfo.Name() {
			continue
		}
		candidates = append(candidates, meta)
	}

	sortV2MetadataByLikenessAndAge(repoInfo, hmacKey, candidates)
	if maxCandidates >= 0 && len(candidates) > maxCandidates {
		// select the youngest metadata
		candidates = candidates[:maxCandidates]
	}
	return candidates
}
// byLikeness is a sorting container for v2 metadata candidates for cross repository mount. The
// candidate "a" is preferred over "b":
//
//  1. if it was hashed using the same AuthConfig as the one used to authenticate to target repository and the
//     "b" was not
//  2. if a number of its repository path components exactly matching path components of target repository is higher
type byLikeness struct {
	// arr is the slice being sorted (in place).
	arr []metadata.V2Metadata
	// hmacKey verifies whether an entry was tagged with the current auth config.
	hmacKey []byte
	// pathComponents are the target repository's path components to match against.
	pathComponents []string
}
// Less reports whether entry i is a better mount candidate than entry j:
// entries verified against the current HMAC key win outright; otherwise the
// entry sharing more leading path components with the target repository wins.
func (bla byLikeness) Less(i, j int) bool {
	iVerified := metadata.CheckV2MetadataHMAC(&bla.arr[i], bla.hmacKey)
	jVerified := metadata.CheckV2MetadataHMAC(&bla.arr[j], bla.hmacKey)
	if iVerified != jVerified {
		return iVerified
	}
	return numOfMatchingPathComponents(bla.arr[i].SourceRepository, bla.pathComponents) >
		numOfMatchingPathComponents(bla.arr[j].SourceRepository, bla.pathComponents)
}
// Swap exchanges the metadata entries at positions i and j.
func (bla byLikeness) Swap(i, j int) {
	bla.arr[i], bla.arr[j] = bla.arr[j], bla.arr[i]
}
// Len returns the number of metadata entries being sorted.
func (bla byLikeness) Len() int { return len(bla.arr) }
// sortV2MetadataByLikenessAndAge sorts marr in place so that the most
// promising mount candidates come first; entries of equal likeness stay
// ordered from the youngest to the oldest.
func sortV2MetadataByLikenessAndAge(repoInfo reference.Named, hmacKey []byte, marr []metadata.V2Metadata) {
	// reverse the metadata array to shift the newest entries to the beginning
	for lo, hi := 0, len(marr)-1; lo < hi; lo, hi = lo+1, hi-1 {
		marr[lo], marr[hi] = marr[hi], marr[lo]
	}
	// a stable sort preserves that youngest-first order among equal entries
	sort.Stable(byLikeness{
		arr:            marr,
		hmacKey:        hmacKey,
		pathComponents: getPathComponents(repoInfo.Name()),
	})
}
// numOfMatchingPathComponents returns how many leading slash-separated
// components of "pth" exactly match the entries of "matchComponents".
func numOfMatchingPathComponents(pth string, matchComponents []string) int {
	pthComponents := strings.Split(pth, "/")
	limit := len(pthComponents)
	if len(matchComponents) < limit {
		limit = len(matchComponents)
	}
	for i := 0; i < limit; i++ {
		if pthComponents[i] != matchComponents[i] {
			return i
		}
	}
	return limit
}
// getPathComponents splits a repository path into its slash-separated parts.
func getPathComponents(p string) []string {
	return strings.Split(p, "/")
}
// cancelLayerUpload aborts an in-flight blob upload session, logging (but
// otherwise ignoring) any error from the cancellation itself. A nil
// layerUpload is a no-op.
func cancelLayerUpload(ctx context.Context, dgst digest.Digest, layerUpload distribution.BlobWriter) {
	if layerUpload == nil {
		return
	}
	log.G(ctx).Debugf("cancelling upload of blob %s", dgst)
	if err := layerUpload.Cancel(ctx); err != nil {
		log.G(ctx).Warnf("failed to cancel upload: %v", err)
	}
}
package distribution
import (
"context"
"fmt"
"net"
"net/http"
"time"
"github.com/distribution/reference"
"github.com/docker/distribution"
"github.com/docker/distribution/manifest/schema2"
"github.com/docker/distribution/registry/client"
"github.com/docker/distribution/registry/client/auth"
"github.com/docker/docker/dockerversion"
"github.com/docker/docker/registry"
registrytypes "github.com/moby/moby/api/types/registry"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
var (
	// supportedMediaTypes represents acceptable media-type(-prefixes)
	// we use this list to prevent obscure errors when trying to pull
	// OCI artifacts.
	supportedMediaTypes = []string{
		// valid prefixes
		"application/vnd.oci.image",
		"application/vnd.docker",

		// these types may occur on old images, and are copied from
		// defaultImageTypes below.
		"application/octet-stream",
		"application/json",
		"text/html",
		// the empty string covers manifests with a defaulted/omitted media type
		"",
	}

	// defaultImageTypes represents the schema2 config types for images
	defaultImageTypes = []string{
		schema2.MediaTypeImageConfig,
		ocispec.MediaTypeImageConfig,
		// Handle unexpected values from https://github.com/docker/distribution/issues/1621
		// (see also https://github.com/docker/docker/issues/22378,
		// https://github.com/docker/docker/issues/30083)
		"application/octet-stream",
		"application/json",
		"text/html",
		// Treat defaulted values as images, newer types cannot be implied
		"",
	}

	// pluginTypes represents the schema2 config types for plugins
	pluginTypes = []string{
		schema2.MediaTypePluginConfig,
	}

	// mediaTypeClasses maps each known config media type to its class
	// ("image" or "plugin"); populated in init below.
	mediaTypeClasses map[string]string
)
func init() {
	// initialize media type classes with all known types for images and plugins.
	mediaTypeClasses = map[string]string{}
	for _, t := range defaultImageTypes {
		mediaTypeClasses[t] = "image"
	}
	for _, t := range pluginTypes {
		mediaTypeClasses[t] = "plugin"
	}
}
// newRepository returns a repository (v2 only). It creates an HTTP transport
// providing timeout settings and authentication support, and also verifies the
// remote API version.
func newRepository(
	ctx context.Context, ref reference.Named, endpoint registry.APIEndpoint,
	metaHeaders http.Header, authConfig *registrytypes.AuthConfig, actions ...string,
) (distribution.Repository, error) {
	// Trim the hostname to form the RemoteName
	repoName := reference.Path(ref)

	direct := &net.Dialer{
		Timeout:   30 * time.Second,
		KeepAlive: 30 * time.Second,
	}

	// TODO(dmcgowan): Call close idle connections when complete, use keep alive
	base := &http.Transport{
		Proxy:               http.ProxyFromEnvironment,
		DialContext:         direct.DialContext,
		TLSHandshakeTimeout: 10 * time.Second,
		TLSClientConfig:     endpoint.TLSConfig,
		// TODO(dmcgowan): Call close idle connections when complete and use keep alive
		DisableKeepAlives: true,
	}

	modifiers := registry.Headers(dockerversion.DockerUserAgent(ctx), metaHeaders)
	authTransport := newTransport(base, modifiers...)

	// Ping the /v2/ endpoint to verify the API version and collect the
	// authentication challenges advertised by the registry.
	challengeManager, err := registry.PingV2Registry(endpoint.URL, authTransport)
	if err != nil {
		transportOK := false
		if responseErr, ok := err.(registry.PingResponseError); ok {
			// the registry responded, so the transport itself works
			transportOK = true
			err = responseErr.Err
		}
		return nil, fallbackError{
			err:         err,
			transportOK: transportOK,
		}
	}

	if authConfig.RegistryToken != "" {
		// A pre-acquired registry token bypasses the token handshake entirely.
		modifiers = append(modifiers, auth.NewAuthorizer(challengeManager, &passThruTokenHandler{token: authConfig.RegistryToken}))
	} else {
		// Otherwise authenticate via the token handler, falling back to basic auth.
		creds := registry.NewStaticCredentialStore(authConfig)
		tokenHandler := auth.NewTokenHandlerWithOptions(auth.TokenHandlerOptions{
			Transport:   authTransport,
			Credentials: creds,
			Scopes: []auth.Scope{auth.RepositoryScope{
				Repository: repoName,
				Actions:    actions,
			}},
			ClientID: registry.AuthClientID,
		})
		basicHandler := auth.NewBasicHandler(creds)
		modifiers = append(modifiers, auth.NewAuthorizer(challengeManager, tokenHandler, basicHandler))
	}
	tr := newTransport(base, modifiers...)

	// FIXME(thaJeztah): should this just take the original repoInfo.Name instead of converting the remote name back to a named reference?
	repoNameRef, err := reference.WithName(repoName)
	if err != nil {
		return nil, fallbackError{
			err:         err,
			transportOK: true,
		}
	}

	repo, err := client.NewRepository(repoNameRef, endpoint.URL.String(), tr)
	if err != nil {
		return nil, fallbackError{
			err:         err,
			transportOK: true,
		}
	}
	return repo, nil
}
// passThruTokenHandler is an auth handler that always supplies a fixed,
// pre-acquired registry token instead of performing a token handshake.
type passThruTokenHandler struct {
	token string
}
// Scheme returns the authentication scheme this handler responds to ("bearer").
func (th *passThruTokenHandler) Scheme() string {
	return "bearer"
}
// AuthorizeRequest sets the Authorization header to the static bearer token.
// The params from the auth challenge are ignored.
func (th *passThruTokenHandler) AuthorizeRequest(req *http.Request, params map[string]string) error {
	req.Header.Set("Authorization", "Bearer "+th.token)
	return nil
}
package distribution
import (
"context"
"github.com/containerd/log"
"github.com/distribution/reference"
"github.com/docker/distribution"
"github.com/docker/docker/errdefs"
)
// GetRepositories returns a list of repositories configured for the given
// reference. Multiple repositories can be returned if the reference is for
// the default (Docker Hub) registry and a mirror is configured, but it omits
// registries that were not reachable (pinging the /v2/ endpoint failed).
//
// It returns an error if it was unable to reach any of the registries for
// the given reference, or if the provided reference is invalid.
func GetRepositories(ctx context.Context, ref reference.Named, config *ImagePullConfig) ([]distribution.Repository, error) {
	repoName := reference.TrimNamed(ref)
	// makes sure name is not empty or `scratch`
	if err := validateRepoName(repoName); err != nil {
		return nil, errdefs.InvalidParameter(err)
	}

	endpoints, err := config.RegistryService.LookupPullEndpoints(reference.Domain(repoName))
	if err != nil {
		return nil, err
	}

	var repos []distribution.Repository
	var lastErr error
	for _, endpoint := range endpoints {
		repo, err := newRepository(ctx, repoName, endpoint, nil, config.AuthConfig, "pull")
		if err != nil {
			// skip unreachable endpoints, but remember the failure
			log.G(ctx).WithFields(log.Fields{"endpoint": endpoint.URL.String(), "error": err}).Info("endpoint")
			lastErr = err
			continue
		}
		repos = append(repos, repo)
	}

	if len(repos) == 0 {
		// every endpoint failed; surface the most recent error
		return nil, lastErr
	}
	return repos, nil
}
package distribution
import (
"net/http"
"github.com/docker/distribution/registry/client/transport"
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
)
// newTransport creates a new transport which will apply modifiers to
// the request on a RoundTrip call, wrapped with OpenTelemetry
// instrumentation so the Traceparent header is propagated.
func newTransport(base http.RoundTripper, modifiers ...transport.RequestModifier) http.RoundTripper {
	return otelhttp.NewTransport(transport.NewTransport(base, modifiers...))
}
package utils
import (
"context"
"errors"
"io"
"syscall"
"github.com/containerd/log"
"github.com/docker/docker/pkg/progress"
"github.com/docker/docker/pkg/streamformatter"
)
// WriteDistributionProgress drains progressChan, writing each update to
// outStream as a JSON stream. On the first write failure it calls cancelFunc
// once, then keeps draining (without writing) until the channel is closed so
// the sender never blocks.
func WriteDistributionProgress(cancelFunc func(), outStream io.Writer, progressChan <-chan progress.Progress) {
	out := streamformatter.NewJSONProgressOutput(outStream, false)

	cancelled := false
	for prog := range progressChan {
		err := out.WriteProgress(prog)
		if err == nil || cancelled {
			continue
		}
		// don't log broken pipe errors as this is the normal case when a client aborts
		if errors.Is(err, syscall.EPIPE) {
			log.G(context.TODO()).Info("Pull session cancelled")
		} else {
			log.G(context.TODO()).Errorf("error writing progress to client: %v", err)
		}
		cancelFunc()
		cancelled = true
		// Don't return: continue draining progressChan until it's closed
		// to avoid a deadlock.
	}
}
package xfer
import (
"context"
"errors"
"fmt"
"io"
"time"
"github.com/containerd/log"
"github.com/docker/distribution"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
"github.com/docker/docker/pkg/ioutils"
"github.com/docker/docker/pkg/progress"
"github.com/moby/go-archive/compression"
)
// maxDownloadAttempts is the default number of times a layer download is
// attempted before giving up.
const maxDownloadAttempts = 5
// LayerDownloadManager figures out which layers need to be downloaded, then
// registers and downloads those, taking into account dependencies between
// layers.
type LayerDownloadManager struct {
	// layerStore is where downloaded layers get registered.
	layerStore layer.Store
	// tm deduplicates in-flight transfers and limits their concurrency.
	tm *transferManager
	// waitDuration is the tick interval used between download retries.
	waitDuration time.Duration
	// maxDownloadAttempts caps how often a single layer download is retried.
	maxDownloadAttempts int
}
// SetConcurrency sets the max concurrent downloads for each pull
func (ldm *LayerDownloadManager) SetConcurrency(concurrency int) {
	ldm.tm.setConcurrency(concurrency)
}
// NewLayerDownloadManager returns a new LayerDownloadManager configured with
// the given layer store and concurrency limit; options may override defaults.
func NewLayerDownloadManager(layerStore layer.Store, concurrencyLimit int, options ...DownloadOption) *LayerDownloadManager {
	ldm := &LayerDownloadManager{
		layerStore:          layerStore,
		tm:                  newTransferManager(concurrencyLimit),
		waitDuration:        time.Second,
		maxDownloadAttempts: maxDownloadAttempts,
	}
	for _, opt := range options {
		opt(ldm)
	}
	return ldm
}
// DownloadOption sets options for the LayerDownloadManager.
type DownloadOption func(*LayerDownloadManager)
// WithMaxDownloadAttempts configures the maximum number of download
// attempts for a download manager.
func WithMaxDownloadAttempts(attempts int) DownloadOption {
	return func(ldm *LayerDownloadManager) {
		ldm.maxDownloadAttempts = attempts
	}
}
// downloadTransfer tracks the state of one layer download and its
// registration in the layer store.
type downloadTransfer struct {
	transfer

	// layerStore is the store the downloaded layer is registered into.
	layerStore layer.Store

	// layer and err hold the outcome; callers wait for done() before
	// reading them via result().
	layer layer.Layer
	err   error
}
// result returns the layer resulting from the download, if the download
// and registration were successful. It is only meaningful once the
// transfer's done channel has been closed.
func (d *downloadTransfer) result() (layer.Layer, error) {
	return d.layer, d.err
}
// A DownloadDescriptor references a layer that may need to be downloaded.
type DownloadDescriptor interface {
	// Key returns the key used to deduplicate downloads.
	Key() string
	// ID returns the ID for display purposes.
	ID() string
	// DiffID should return the DiffID for this layer, or an error
	// if it is unknown (for example, if it has not been downloaded
	// before).
	DiffID() (layer.DiffID, error)
	// Download is called to perform the download. It returns a reader for
	// the layer data and a size used for progress reporting.
	Download(ctx context.Context, progressOutput progress.Output) (io.ReadCloser, int64, error)
	// Close is called when the download manager is finished with this
	// descriptor and will not call Download again or read from the reader
	// that Download returned.
	Close()
}
// DigestRegisterer can be implemented by a DownloadDescriptor, and provides a
// Registered method which gets called after a downloaded layer is registered.
// This allows the user of the download manager to know the DiffID of each
// registered layer. This method is called if a cast to DigestRegisterer is
// successful.
type DigestRegisterer interface {
	// TODO existing implementations in distribution and builder-next swallow errors
	// when registering the diffID. Consider changing the Registered signature
	// to return the error.
	Registered(diffID layer.DiffID)
}
// Download is a blocking function which ensures the requested layers are
// present in the layer store. It uses the string returned by the Key method to
// deduplicate downloads. If a given layer is not already known to present in
// the layer store, and the key is not used by an in-progress download, the
// Download method is called to get the layer tar data. Layers are then
// registered in the appropriate order. The caller must call the returned
// release function once it is done with the returned RootFS object.
func (ldm *LayerDownloadManager) Download(ctx context.Context, layers []DownloadDescriptor, progressOutput progress.Output) (image.RootFS, func(), error) {
	var (
		topLayer    layer.Layer
		topDownload *downloadTransfer
		xferWatcher *watcher
		// missingLayer flips to true once a layer is not found locally;
		// from then on every following layer must be downloaded too.
		missingLayer bool
		// transferKey accumulates the keys of all layers so far, making the
		// transfer key unique per layer *stack*, not per layer.
		transferKey    = ""
		downloadsByKey = make(map[string]*downloadTransfer)
	)

	rootFS := image.RootFS{Type: image.TypeLayers}
	for _, descriptor := range layers {
		key := descriptor.Key()
		transferKey += key

		if !missingLayer {
			missingLayer = true
			diffID, err := descriptor.DiffID()
			if err == nil {
				// Probe the layer store for the chain built so far plus
				// this layer.
				getRootFS := rootFS
				getRootFS.Append(diffID)
				l, err := ldm.layerStore.Get(getRootFS.ChainID())
				if err == nil {
					// Layer already exists.
					log.G(ctx).Debugf("Layer already exists: %s", descriptor.ID())
					progress.Update(progressOutput, descriptor.ID(), "Already exists")
					if topLayer != nil {
						layer.ReleaseAndLog(ldm.layerStore, topLayer)
					}
					topLayer = l
					missingLayer = false
					rootFS.Append(diffID)
					// Register this repository as a source of this layer.
					if withRegistered, ok := descriptor.(DigestRegisterer); ok { // As layerstore may set the driver
						withRegistered.Registered(diffID)
					}
					continue
				}
			}
		}

		// Does this layer have the same data as a previous layer in
		// the stack? If so, avoid downloading it more than once.
		var topDownloadUncasted transfer
		if existingDownload, ok := downloadsByKey[key]; ok {
			xferFunc := ldm.makeDownloadFuncFromDownload(descriptor, existingDownload, topDownload)
			defer topDownload.transfer.release(xferWatcher)
			topDownloadUncasted, xferWatcher = ldm.tm.transfer(transferKey, xferFunc, progressOutput)
			topDownload = topDownloadUncasted.(*downloadTransfer)
			continue
		}

		// Layer is not known to exist - download and register it.
		progress.Update(progressOutput, descriptor.ID(), "Pulling fs layer")

		var xferFunc doFunc
		if topDownload != nil {
			// chain onto the previous layer's download
			xferFunc = ldm.makeDownloadFunc(descriptor, "", topDownload)
			defer topDownload.transfer.release(xferWatcher)
		} else {
			// first missing layer: register on top of the locally-found chain
			xferFunc = ldm.makeDownloadFunc(descriptor, rootFS.ChainID(), nil)
		}
		topDownloadUncasted, xferWatcher = ldm.tm.transfer(transferKey, xferFunc, progressOutput)
		topDownload = topDownloadUncasted.(*downloadTransfer)
		downloadsByKey[key] = topDownload
	}

	if topDownload == nil {
		// Everything was already present locally; nothing to wait for.
		return rootFS, func() {
			if topLayer != nil {
				layer.ReleaseAndLog(ldm.layerStore, topLayer)
			}
		}, nil
	}

	// Won't be using the list built up so far - will generate it
	// from downloaded layers instead.
	rootFS.DiffIDs = []layer.DiffID{}

	defer func() {
		if topLayer != nil {
			layer.ReleaseAndLog(ldm.layerStore, topLayer)
		}
	}()

	select {
	case <-ctx.Done():
		topDownload.transfer.release(xferWatcher)
		return rootFS, func() {}, ctx.Err()
	case <-topDownload.done():
		break
	}

	l, err := topDownload.result()
	if err != nil {
		topDownload.transfer.release(xferWatcher)
		return rootFS, func() {}, err
	}

	// Walk the parent chain from the top, prepending each DiffID.
	// Must do this exactly len(layers) times, so we don't include the
	// base layer on Windows.
	for range layers {
		if l == nil {
			topDownload.transfer.release(xferWatcher)
			return rootFS, func() {}, errors.New("internal error: too few parent layers")
		}
		rootFS.DiffIDs = append([]layer.DiffID{l.DiffID()}, rootFS.DiffIDs...)
		l = l.Parent()
	}
	return rootFS, func() { topDownload.transfer.release(xferWatcher) }, err
}
// makeDownloadFunc returns a function that performs the layer download and
// registration. If parentDownload is non-nil, it waits for that download to
// complete before the registration step, and registers the downloaded data
// on top of parentDownload's resulting layer. Otherwise, it registers the
// layer on top of the ChainID given by parentLayer.
func (ldm *LayerDownloadManager) makeDownloadFunc(descriptor DownloadDescriptor, parentLayer layer.ChainID, parentDownload *downloadTransfer) doFunc {
	return func(progressChan chan<- progress.Progress, start <-chan struct{}, inactive chan<- struct{}) transfer {
		d := &downloadTransfer{
			transfer:   newTransfer(),
			layerStore: ldm.layerStore,
		}
		go func() {
			defer func() {
				close(progressChan)
			}()
			progressOutput := progress.ChanOutput(progressChan)
			// Wait for the transferManager to grant a slot; only show
			// "Waiting" if the slot is not immediately available.
			select {
			case <-start:
			default:
				progress.Update(progressOutput, descriptor.ID(), "Waiting")
				<-start
			}
			if parentDownload != nil {
				// Did the parent download already fail or get
				// cancelled?
				select {
				case <-parentDownload.done():
					_, err := parentDownload.result()
					if err != nil {
						d.err = err
						return
					}
				default:
				}
			}
			var (
				downloadReader io.ReadCloser
				size           int64
				err            error
				attempt        = 1
			)
			defer descriptor.Close()
			// Retry loop around the download itself; exits on success,
			// cancellation, DoNotRetry, or exhausting maxDownloadAttempts.
			for {
				downloadReader, size, err = descriptor.Download(d.transfer.context(), progressOutput)
				if err == nil {
					break
				}
				// If an error was returned because the context
				// was cancelled, we shouldn't retry.
				select {
				case <-d.transfer.context().Done():
					d.err = err
					return
				default:
				}
				// NOTE(review): this bare type assertion does not detect a
				// wrapped DoNotRetry; IsDoNotRetryError (errors.As) would —
				// confirm whether Download can return wrapped errors.
				if _, isDNR := err.(DoNotRetry); isDNR || attempt >= ldm.maxDownloadAttempts {
					log.G(context.TODO()).Errorf("Download failed after %d attempts: %v", attempt, err)
					d.err = err
					return
				}
				log.G(context.TODO()).Infof("Download failed, retrying (%d/%d): %v", attempt, ldm.maxDownloadAttempts, err)
				// Linear backoff: 5s after the first failure, 10s after the
				// second, and so on (in units of waitDuration ticks).
				delay := attempt * 5
				ticker := time.NewTicker(ldm.waitDuration)
				attempt++
			selectLoop:
				for {
					// The map trick appends "s" to "second" unless delay == 1.
					progress.Updatef(progressOutput, descriptor.ID(), "Retrying in %d second%s", delay, (map[bool]string{true: "s"})[delay != 1])
					select {
					case <-ticker.C:
						delay--
						if delay == 0 {
							ticker.Stop()
							break selectLoop
						}
					case <-d.transfer.context().Done():
						ticker.Stop()
						d.err = errors.New("download cancelled during retry delay")
						return
					}
				}
			}
			// Download complete: this transfer no longer occupies an
			// active-transfer slot in the transferManager.
			close(inactive)
			if parentDownload != nil {
				select {
				case <-d.transfer.context().Done():
					d.err = errors.New("layer registration cancelled")
					downloadReader.Close()
					return
				case <-parentDownload.done():
				}
				l, err := parentDownload.result()
				if err != nil {
					d.err = err
					downloadReader.Close()
					return
				}
				parentLayer = l.ChainID()
			}
			reader := progress.NewProgressReader(ioutils.NewCancelReadCloser(d.transfer.context(), downloadReader), progressOutput, size, descriptor.ID(), "Extracting")
			defer reader.Close()
			inflatedLayerData, err := compression.DecompressStream(reader)
			if err != nil {
				d.err = fmt.Errorf("could not get decompression stream: %v", err)
				return
			}
			defer inflatedLayerData.Close()
			var src distribution.Descriptor
			if fs, ok := descriptor.(distribution.Describable); ok {
				src = fs.Descriptor()
			}
			// Prefer RegisterWithDescriptor when the store supports it so the
			// source descriptor is preserved alongside the layer.
			if ds, ok := d.layerStore.(layer.DescribableStore); ok {
				d.layer, err = ds.RegisterWithDescriptor(inflatedLayerData, parentLayer, src)
			} else {
				d.layer, err = d.layerStore.Register(inflatedLayerData, parentLayer)
			}
			if err != nil {
				select {
				case <-d.transfer.context().Done():
					d.err = errors.New("layer registration cancelled")
				default:
					d.err = fmt.Errorf("failed to register layer: %v", err)
				}
				return
			}
			progress.Update(progressOutput, descriptor.ID(), "Pull complete")
			if withRegistered, ok := descriptor.(DigestRegisterer); ok {
				withRegistered.Registered(d.layer.DiffID())
			}
			// Doesn't actually need to be its own goroutine, but
			// done like this so we can defer close(c).
			go func() {
				<-d.transfer.released()
				if d.layer != nil {
					layer.ReleaseAndLog(d.layerStore, d.layer)
				}
			}()
		}()
		return d
	}
}
// makeDownloadFuncFromDownload returns a function that performs the layer
// registration when the layer data is coming from an existing download. It
// waits for sourceDownload and parentDownload to complete, and then
// reregisters the data from sourceDownload's top layer on top of
// parentDownload. This function does not log progress output because it would
// interfere with the progress reporting for sourceDownload, which has the same
// Key.
func (ldm *LayerDownloadManager) makeDownloadFuncFromDownload(descriptor DownloadDescriptor, sourceDownload *downloadTransfer, parentDownload *downloadTransfer) doFunc {
	return func(progressChan chan<- progress.Progress, start <-chan struct{}, inactive chan<- struct{}) transfer {
		d := &downloadTransfer{
			transfer:   newTransfer(),
			layerStore: ldm.layerStore,
		}
		go func() {
			defer func() {
				close(progressChan)
			}()
			<-start
			// This transfer only re-registers already-downloaded data, so it
			// immediately gives up its active-transfer slot.
			close(inactive)
			select {
			case <-d.transfer.context().Done():
				d.err = errors.New("layer registration cancelled")
				return
			case <-parentDownload.done():
			}
			l, err := parentDownload.result()
			if err != nil {
				d.err = err
				return
			}
			parentLayer := l.ChainID()
			// sourceDownload should have already finished if
			// parentDownload finished, but wait for it explicitly
			// to be sure.
			select {
			case <-d.transfer.context().Done():
				d.err = errors.New("layer registration cancelled")
				return
			case <-sourceDownload.done():
			}
			l, err = sourceDownload.result()
			if err != nil {
				d.err = err
				return
			}
			layerReader, err := l.TarStream()
			if err != nil {
				d.err = err
				return
			}
			defer layerReader.Close()
			var src distribution.Descriptor
			if fs, ok := l.(distribution.Describable); ok {
				src = fs.Descriptor()
			}
			// Preserve the source descriptor when the store supports it.
			if ds, ok := d.layerStore.(layer.DescribableStore); ok {
				d.layer, err = ds.RegisterWithDescriptor(layerReader, parentLayer, src)
			} else {
				d.layer, err = d.layerStore.Register(layerReader, parentLayer)
			}
			if err != nil {
				d.err = fmt.Errorf("failed to register layer: %v", err)
				return
			}
			if withRegistered, ok := descriptor.(DigestRegisterer); ok {
				withRegistered.Registered(d.layer.DiffID())
			}
			// Doesn't actually need to be its own goroutine, but
			// done like this so we can defer close(c).
			go func() {
				<-d.transfer.released()
				if d.layer != nil {
					layer.ReleaseAndLog(d.layerStore, d.layer)
				}
			}()
		}()
		return d
	}
}
package xfer
import (
"context"
"runtime"
"sync"
"github.com/docker/docker/pkg/progress"
"github.com/pkg/errors"
)
// DoNotRetry is an error wrapper indicating that the error cannot be resolved
// with a retry.
type DoNotRetry struct {
Err error
}
// Error returns the stringified representation of the encapsulated error.
func (e DoNotRetry) Error() string {
return e.Err.Error()
}
// IsDoNotRetryError returns true if the error is caused by DoNotRetry error,
// and the transfer should not be retried.
func IsDoNotRetryError(err error) bool {
var dnr DoNotRetry
return errors.As(err, &dnr)
}
// watcher is returned by watch and can be passed to release to stop watching.
type watcher struct {
	// signalChan is used to signal to the watcher goroutine that
	// new progress information is available, or that the transfer
	// has finished.
	signalChan chan struct{}
	// releaseChan signals to the watcher goroutine that the watcher
	// should be detached. It also serves as this watcher's key in the
	// transfer's watchers map.
	releaseChan chan struct{}
	// running remains open as long as the watcher is watching the
	// transfer. It gets closed if the transfer finishes or the
	// watcher is detached.
	running chan struct{}
}
// transfer represents an in-progress transfer.
type transfer interface {
	// watch attaches an observer whose progress output mirrors the transfer.
	watch(progressOutput progress.Output) *watcher
	// release detaches a watcher previously returned by watch.
	release(*watcher)
	// context returns the context governing the transfer's lifetime.
	context() context.Context
	// close marks the transfer as no longer tracked by the transferManager.
	close()
	// done is closed when the transfer finishes or is cancelled.
	done() <-chan struct{}
	// released is closed once all watchers are gone and the transfer is
	// no longer tracked.
	released() <-chan struct{}
	// broadcast fans progress from mainProgressChan out to all watchers.
	broadcast(mainProgressChan <-chan progress.Progress)
}
// xfer is the concrete implementation of the transfer interface.
type xfer struct {
	mu sync.Mutex
	// ctx governs the transfer's lifetime; cancel aborts it.
	ctx    context.Context
	cancel context.CancelFunc
	// watchers keeps track of the goroutines monitoring progress output,
	// indexed by the channels that release them.
	watchers map[chan struct{}]*watcher
	// lastProgress is the most recently received progress event.
	lastProgress progress.Progress
	// hasLastProgress is true when lastProgress has been set.
	hasLastProgress bool
	// running remains open as long as the transfer is in progress.
	running chan struct{}
	// releasedChan stays open until all watchers release the transfer and
	// the transfer is no longer tracked by the transferManager.
	releasedChan chan struct{}
	// broadcastDone is true if the main progress channel has closed.
	broadcastDone bool
	// closed is true if close has been called.
	closed bool
	// broadcastSyncChan allows watchers to "ping" the broadcasting
	// goroutine to wait for it to deplete its input channel. This ensures
	// a detaching watcher won't miss an event that was sent before it
	// started detaching.
	broadcastSyncChan chan struct{}
}
// newTransfer creates a new transfer.
func newTransfer() transfer {
	// This uses context.Background instead of a caller-supplied context
	// so that a transfer won't be cancelled automatically if the client
	// which requested it is ^C'd (there could be other viewers).
	ctx, cancel := context.WithCancel(context.Background())
	return &xfer{
		ctx:               ctx,
		cancel:            cancel,
		watchers:          make(map[chan struct{}]*watcher),
		running:           make(chan struct{}),
		releasedChan:      make(chan struct{}),
		broadcastSyncChan: make(chan struct{}),
	}
}
// broadcast copies the progress and error output to all viewers.
func (t *xfer) broadcast(mainProgressChan <-chan progress.Progress) {
	for {
		var (
			p  progress.Progress
			ok bool
		)
		select {
		case p, ok = <-mainProgressChan:
		default:
			// We've depleted the channel, so now we can handle
			// reads on broadcastSyncChan to let detaching watchers
			// know we're caught up.
			select {
			case <-t.broadcastSyncChan:
				continue
			case p, ok = <-mainProgressChan:
			}
		}
		t.mu.Lock()
		if ok {
			t.lastProgress = p
			t.hasLastProgress = true
			for _, w := range t.watchers {
				// Non-blocking ping: a watcher that is already
				// signalled will pick up lastProgress on its
				// next wake-up.
				select {
				case w.signalChan <- struct{}{}:
				default:
				}
			}
		} else {
			// The sender closed mainProgressChan: transfer is done.
			t.broadcastDone = true
		}
		t.mu.Unlock()
		if !ok {
			close(t.running)
			return
		}
	}
}
// watch adds a watcher to the transfer. The supplied channel gets progress
// updates and is closed when the transfer finishes.
func (t *xfer) watch(progressOutput progress.Output) *watcher {
	t.mu.Lock()
	defer t.mu.Unlock()
	w := &watcher{
		releaseChan: make(chan struct{}),
		signalChan:  make(chan struct{}),
		running:     make(chan struct{}),
	}
	t.watchers[w.releaseChan] = w
	if t.broadcastDone {
		// The transfer has already finished; there is nothing to watch.
		close(w.running)
		return w
	}
	go func() {
		defer func() {
			close(w.running)
		}()
		var (
			done           bool
			lastWritten    progress.Progress
			hasLastWritten bool
		)
		for {
			t.mu.Lock()
			hasLastProgress := t.hasLastProgress
			lastProgress := t.lastProgress
			t.mu.Unlock()
			// Make sure we don't write the last progress item
			// twice.
			if hasLastProgress && (!done || !hasLastWritten || lastProgress != lastWritten) {
				progressOutput.WriteProgress(lastProgress)
				lastWritten = lastProgress
				hasLastWritten = true
			}
			if done {
				return
			}
			select {
			case <-w.signalChan:
				// New progress is available; loop to emit it.
			case <-w.releaseChan:
				done = true
				// Since the watcher is going to detach, make
				// sure the broadcaster is caught up so we
				// don't miss anything.
				select {
				case t.broadcastSyncChan <- struct{}{}:
				case <-t.running:
				}
			case <-t.running:
				// Transfer finished; do one final pass (guarded
				// above) and exit.
				done = true
			}
		}
	}()
	return w
}
// release is the inverse of watch; indicating that the watcher no longer wants
// to be notified about the progress of the transfer. All calls to watch must
// be paired with later calls to release so that the lifecycle of the transfer
// is properly managed.
func (t *xfer) release(watcher *watcher) {
	t.mu.Lock()
	delete(t.watchers, watcher.releaseChan)
	if len(t.watchers) == 0 {
		if t.closed {
			// released may have been closed already if all
			// watchers were released, then another one was added
			// while waiting for a previous watcher goroutine to
			// finish.
			select {
			case <-t.releasedChan:
			default:
				close(t.releasedChan)
			}
		} else {
			// No watchers remain but the transfer is still tracked:
			// cancel it (zero watchers cancels a transfer).
			t.cancel()
		}
	}
	t.mu.Unlock()
	close(watcher.releaseChan)
	// Block until the watcher goroutine completes.
	<-watcher.running
}
// done returns a channel which is closed if the transfer completes or is
// cancelled. Note that having 0 watchers causes a transfer to be cancelled.
func (t *xfer) done() <-chan struct{} {
	// Deliberately not t.ctx.Done(): that channel closes the instant a
	// cancellation is requested, whereas t.running stays open until the
	// transfer function actually acknowledges the cancellation.
	return t.running
}
// released returns a channel which is closed once every watcher has released
// the transfer AND the transferManager has stopped tracking it.
func (t *xfer) released() <-chan struct{} {
	return t.releasedChan
}
// context returns the context governing this transfer's lifetime.
func (t *xfer) context() context.Context {
	return t.ctx
}
// close is called by the transferManager when the transfer is no longer
// being tracked. Once both conditions hold (untracked and zero watchers),
// releasedChan is closed.
func (t *xfer) close() {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.closed = true
	if len(t.watchers) == 0 {
		close(t.releasedChan)
	}
}
// doFunc is a function called by the transferManager to actually perform
// a transfer. It should be non-blocking. It should wait until the start channel
// is closed before transferring any data. If the function closes inactive, that
// signals to the transferManager that the job is no longer actively moving
// data - for example, it may be waiting for a dependent transfer to finish.
// This prevents it from taking up a slot.
type doFunc func(progressChan chan<- progress.Progress, start <-chan struct{}, inactive chan<- struct{}) transfer
// transferManager is used by LayerDownloadManager and LayerUploadManager to
// schedule and deduplicate transfers. It is up to the transferManager
// to make the scheduling and concurrency decisions.
type transferManager struct {
	mu sync.Mutex
	// concurrencyLimit is the maximum number of simultaneously active
	// transfers; 0 means unlimited.
	concurrencyLimit int
	// activeTransfers counts transfers currently occupying a slot.
	activeTransfers int
	// transfers maps deduplication keys to in-flight transfers.
	transfers map[string]transfer
	// waitingTransfers holds the start channels of queued transfers,
	// in FIFO order.
	waitingTransfers []chan struct{}
}
// newTransferManager returns a new transferManager.
func newTransferManager(concurrencyLimit int) *transferManager {
	tm := &transferManager{
		concurrencyLimit: concurrencyLimit,
		transfers:        make(map[string]transfer),
	}
	return tm
}
// setConcurrency sets the concurrencyLimit.
func (tm *transferManager) setConcurrency(concurrency int) {
	tm.mu.Lock()
	defer tm.mu.Unlock()
	tm.concurrencyLimit = concurrency
}
// transfer checks if a transfer matching the given key is in progress. If not,
// it starts one by calling xferFunc. The caller supplies a channel which
// receives progress output from the transfer.
func (tm *transferManager) transfer(key string, xferFunc doFunc, progressOutput progress.Output) (transfer, *watcher) {
	tm.mu.Lock()
	defer tm.mu.Unlock()
	for {
		// Note: this local variable shadows the package-level xfer type.
		xfer, present := tm.transfers[key]
		if !present {
			break
		}
		// transfer is already in progress.
		watcher := xfer.watch(progressOutput)
		select {
		case <-xfer.context().Done():
			// We don't want to watch a transfer that has been cancelled.
			// Wait for it to be removed from the map and try again.
			xfer.release(watcher)
			tm.mu.Unlock()
			// The goroutine that removes this transfer from the
			// map is also waiting for xfer.Done(), so yield to it.
			// This could be avoided by adding a Closed method
			// to transfer to allow explicitly waiting for it to be
			// removed the map, but forcing a scheduling round in
			// this very rare case seems better than bloating the
			// interface definition.
			runtime.Gosched()
			<-xfer.done()
			tm.mu.Lock()
		default:
			return xfer, watcher
		}
	}
	start := make(chan struct{})
	inactive := make(chan struct{})
	// Start immediately when a slot is free (limit 0 = unlimited);
	// otherwise queue the start channel for inactivate to close later.
	if tm.concurrencyLimit == 0 || tm.activeTransfers < tm.concurrencyLimit {
		close(start)
		tm.activeTransfers++
	} else {
		tm.waitingTransfers = append(tm.waitingTransfers, start)
	}
	mainProgressChan := make(chan progress.Progress)
	xfer := xferFunc(mainProgressChan, start, inactive)
	watcher := xfer.watch(progressOutput)
	go xfer.broadcast(mainProgressChan)
	tm.transfers[key] = xfer
	// When the transfer is finished, remove from the map.
	go func() {
		for {
			select {
			case <-inactive:
				tm.mu.Lock()
				tm.inactivate(start)
				tm.mu.Unlock()
				// nil the channel so the slot is only released once.
				inactive = nil
			case <-xfer.done():
				tm.mu.Lock()
				if inactive != nil {
					tm.inactivate(start)
				}
				delete(tm.transfers, key)
				tm.mu.Unlock()
				xfer.close()
				return
			}
		}
	}()
	return xfer, watcher
}
// inactivate releases the concurrency slot held by the transfer whose start
// channel is given, handing the slot to the next queued transfer if any.
// Callers must hold tm.mu.
func (tm *transferManager) inactivate(start chan struct{}) {
	// If the transfer was started, remove it from the activeTransfers
	// count.
	select {
	case <-start:
		// Start next transfer if any are waiting
		if len(tm.waitingTransfers) != 0 {
			close(tm.waitingTransfers[0])
			tm.waitingTransfers = tm.waitingTransfers[1:]
		} else {
			tm.activeTransfers--
		}
	default:
		// The transfer never started, so it holds no slot.
	}
}
package xfer
import (
"context"
"errors"
"time"
"github.com/containerd/log"
"github.com/docker/distribution"
"github.com/docker/docker/layer"
"github.com/docker/docker/pkg/progress"
)
// maxUploadAttempts is the maximum number of times a layer upload is tried
// before giving up.
const maxUploadAttempts = 5
// LayerUploadManager provides task management and progress reporting for
// uploads.
type LayerUploadManager struct {
	tm           *transferManager // schedules and deduplicates uploads
	waitDuration time.Duration    // tick interval for the retry countdown
}
// SetConcurrency sets the max concurrent uploads for each push.
// It delegates to the underlying transferManager.
func (lum *LayerUploadManager) SetConcurrency(concurrency int) {
	lum.tm.setConcurrency(concurrency)
}
// NewLayerUploadManager returns a new LayerUploadManager, applying any
// supplied functional options.
func NewLayerUploadManager(concurrencyLimit int, options ...func(*LayerUploadManager)) *LayerUploadManager {
	m := &LayerUploadManager{
		tm:           newTransferManager(concurrencyLimit),
		waitDuration: time.Second,
	}
	for _, opt := range options {
		opt(m)
	}
	return m
}
// uploadTransfer is a transfer that produces a remote descriptor for the
// uploaded layer.
type uploadTransfer struct {
	transfer
	// remoteDescriptor is set on success to the descriptor returned by
	// Upload.
	remoteDescriptor distribution.Descriptor
	// err is the terminal error, if any; read after done() closes.
	err error
}
// An UploadDescriptor references a layer that may need to be uploaded.
type UploadDescriptor interface {
	// Key returns the key used to deduplicate uploads.
	Key() string
	// ID returns the ID for display purposes.
	ID() string
	// DiffID should return the DiffID for this layer.
	DiffID() layer.DiffID
	// Upload is called to perform the Upload.
	Upload(ctx context.Context, progressOutput progress.Output) (distribution.Descriptor, error)
	// SetRemoteDescriptor provides the distribution.Descriptor that was
	// returned by Upload. This descriptor is not to be confused with
	// the UploadDescriptor interface, which is used for internally
	// identifying layers that are being uploaded.
	SetRemoteDescriptor(descriptor distribution.Descriptor)
}
// Upload is a blocking function which ensures the listed layers are present on
// the remote registry. It uses the string returned by the Key method to
// deduplicate uploads.
func (lum *LayerUploadManager) Upload(ctx context.Context, layers []UploadDescriptor, progressOutput progress.Output) error {
	var (
		uploads          []*uploadTransfer
		dedupDescriptors = make(map[string]*uploadTransfer)
	)
	for _, descriptor := range layers {
		progress.Update(progressOutput, descriptor.ID(), "Preparing")
		key := descriptor.Key()
		if _, present := dedupDescriptors[key]; present {
			continue
		}
		xferFunc := lum.makeUploadFunc(descriptor)
		upload, watcher := lum.tm.transfer(descriptor.Key(), xferFunc, progressOutput)
		// Deliberately deferred inside the loop: every watcher stays
		// attached until Upload returns.
		defer upload.release(watcher)
		uploads = append(uploads, upload.(*uploadTransfer))
		dedupDescriptors[key] = upload.(*uploadTransfer)
	}
	// Wait for each upload to finish (or the caller's context to end),
	// failing fast on the first error.
	for _, upload := range uploads {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-upload.transfer.done():
			if upload.err != nil {
				return upload.err
			}
		}
	}
	// Propagate the resulting descriptors back to every input layer,
	// including deduplicated ones.
	for _, l := range layers {
		l.SetRemoteDescriptor(dedupDescriptors[l.Key()].remoteDescriptor)
	}
	return nil
}
// makeUploadFunc returns a doFunc that uploads the layer referenced by
// descriptor, retrying transient failures with an increasing delay. Note
// that inactive is never closed: an upload occupies its concurrency slot
// for its entire duration.
func (lum *LayerUploadManager) makeUploadFunc(descriptor UploadDescriptor) doFunc {
	return func(progressChan chan<- progress.Progress, start <-chan struct{}, inactive chan<- struct{}) transfer {
		u := &uploadTransfer{
			transfer: newTransfer(),
		}
		go func() {
			defer func() {
				close(progressChan)
			}()
			progressOutput := progress.ChanOutput(progressChan)
			// Wait for a free slot; only show "Waiting" when blocked.
			select {
			case <-start:
			default:
				progress.Update(progressOutput, descriptor.ID(), "Waiting")
				<-start
			}
			retries := 0
			for {
				remoteDescriptor, err := descriptor.Upload(u.transfer.context(), progressOutput)
				if err == nil {
					u.remoteDescriptor = remoteDescriptor
					break
				}
				// If an error was returned because the context
				// was cancelled, we shouldn't retry.
				select {
				case <-u.transfer.context().Done():
					u.err = err
					return
				default:
				}
				retries++
				// NOTE(review): the bare type assertion misses wrapped
				// DoNotRetry errors; IsDoNotRetryError (errors.As) would
				// catch them — confirm whether wrapping can occur here.
				if _, isDNR := err.(DoNotRetry); isDNR || retries == maxUploadAttempts {
					log.G(context.TODO()).Errorf("Upload failed: %v", err)
					u.err = err
					return
				}
				log.G(context.TODO()).Errorf("Upload failed, retrying: %v", err)
				// Linear backoff: 5s after the first failure, 10s after
				// the second, and so on (in waitDuration ticks).
				delay := retries * 5
				ticker := time.NewTicker(lum.waitDuration)
			selectLoop:
				for {
					// The map trick appends "s" to "second" unless delay == 1.
					progress.Updatef(progressOutput, descriptor.ID(), "Retrying in %d second%s", delay, (map[bool]string{true: "s"})[delay != 1])
					select {
					case <-ticker.C:
						delay--
						if delay == 0 {
							ticker.Stop()
							break selectLoop
						}
					case <-u.transfer.context().Done():
						ticker.Stop()
						u.err = errors.New("upload cancelled during retry delay")
						return
					}
				}
			}
		}()
		return u
	}
}
package dockerversion
import (
"context"
"fmt"
"runtime"
"sync"
"github.com/docker/docker/pkg/parsers/kernel"
"github.com/docker/docker/pkg/useragent"
)
// UAStringKey is the context key type under which an upstream client's
// user-agent string is stored (read by getUpstreamUserAgent).
type UAStringKey struct{}
// DockerUserAgent is the User-Agent the Docker client uses to identify itself.
// In accordance with RFC 7231 (5.5.3) is of the form:
//
//	[docker client's UA] UpstreamClient([upstream client's UA])
func DockerUserAgent(ctx context.Context, extraVersions ...useragent.VersionInfo) string {
	ua := useragent.AppendVersions(getDaemonUserAgent(), extraVersions...)
	upstream := getUpstreamUserAgent(ctx)
	if upstream == "" {
		return ua
	}
	return ua + " " + upstream
}
var (
	// daemonUAOnce guards the one-time construction of daemonUA.
	daemonUAOnce sync.Once
	// daemonUA caches the daemon's user-agent string.
	daemonUA string
)
// getDaemonUserAgent returns the user-agent to use for requests made by
// the daemon. It is computed once and cached.
//
// It includes;
//
//   - the docker version
//   - go version
//   - git-commit
//   - kernel version (when it can be determined)
//   - os
//   - architecture
func getDaemonUserAgent() string {
	daemonUAOnce.Do(func() {
		versions := []useragent.VersionInfo{
			{Name: "docker", Version: Version},
			{Name: "go", Version: runtime.Version()},
			{Name: "git-commit", Version: GitCommit},
		}
		if kernelVersion, err := kernel.GetKernelVersion(); err == nil {
			versions = append(versions, useragent.VersionInfo{Name: "kernel", Version: kernelVersion.String()})
		}
		versions = append(versions,
			useragent.VersionInfo{Name: "os", Version: runtime.GOOS},
			useragent.VersionInfo{Name: "arch", Version: runtime.GOARCH},
		)
		daemonUA = useragent.AppendVersions("", versions...)
	})
	return daemonUA
}
// getUpstreamUserAgent returns the previously saved user-agent context stored
// in ctx, if one exists, and formats it as:
//
//	UpstreamClient(<upstream user agent string>)
//
// It returns an empty string if no user-agent is present in the context,
// or if the stored value is not a string.
func getUpstreamUserAgent(ctx context.Context) string {
	if ctx == nil {
		return ""
	}
	// Use a single comma-ok type assertion: the original looked the key up
	// twice and asserted without the ok form, which would panic if a value
	// of any other type was stored under UAStringKey.
	upstreamUA, ok := ctx.Value(UAStringKey{}).(string)
	if !ok || upstreamUA == "" {
		return ""
	}
	return fmt.Sprintf("UpstreamClient(%s)", escapeStr(upstreamUA))
}
// charsToEscape lists the runes that must be backslash-escaped in a
// user-agent component.
const charsToEscape = `();\`

// escapeStr returns s with every rune in charsToEscape escaped by a backslash.
func escapeStr(s string) string {
	// Build into a preallocated rune buffer instead of the original
	// `ret += ...` string concatenation, which is quadratic in len(s).
	out := make([]rune, 0, len(s)+4)
	for _, r := range s {
		for _, esc := range charsToEscape {
			if r == esc {
				out = append(out, '\\')
				break
			}
		}
		out = append(out, r)
	}
	return string(out)
}
package errdefs
import (
"context"
cerrdefs "github.com/containerd/errdefs"
)
// errNotFound wraps an error so it exposes the NotFound marker method.
type errNotFound struct{ error }

// NotFound is a no-op marker classifying the error as "not found".
func (errNotFound) NotFound() {}

// Cause returns the wrapped error (github.com/pkg/errors compatibility).
func (e errNotFound) Cause() error {
	return e.error
}

// Unwrap returns the wrapped error for use with errors.Is / errors.As.
func (e errNotFound) Unwrap() error {
	return e.error
}

// NotFound creates an [ErrNotFound] error from the given error.
// It returns the error as-is if it is either nil (no error) or already
// implements [ErrNotFound].
func NotFound(err error) error {
	if err == nil || cerrdefs.IsNotFound(err) {
		return err
	}
	return errNotFound{err}
}
// errInvalidParameter wraps an error so it exposes the InvalidParameter
// marker method.
type errInvalidParameter struct{ error }

// InvalidParameter is a no-op marker classifying the error as an invalid
// parameter.
func (errInvalidParameter) InvalidParameter() {}

// Cause returns the wrapped error (github.com/pkg/errors compatibility).
func (e errInvalidParameter) Cause() error {
	return e.error
}

// Unwrap returns the wrapped error for use with errors.Is / errors.As.
func (e errInvalidParameter) Unwrap() error {
	return e.error
}

// InvalidParameter creates an [ErrInvalidParameter] error from the given error.
// It returns the error as-is if it is either nil (no error) or already
// implements [ErrInvalidParameter].
func InvalidParameter(err error) error {
	if err == nil || cerrdefs.IsInvalidArgument(err) {
		return err
	}
	return errInvalidParameter{err}
}
// errConflict wraps an error so it exposes the Conflict marker method.
type errConflict struct{ error }

// Conflict is a no-op marker classifying the error as a conflict.
func (errConflict) Conflict() {}

// Cause returns the wrapped error (github.com/pkg/errors compatibility).
func (e errConflict) Cause() error {
	return e.error
}

// Unwrap returns the wrapped error for use with errors.Is / errors.As.
func (e errConflict) Unwrap() error {
	return e.error
}

// Conflict creates an [ErrConflict] error from the given error.
// It returns the error as-is if it is either nil (no error) or already
// implements [ErrConflict].
func Conflict(err error) error {
	if err == nil || cerrdefs.IsConflict(err) {
		return err
	}
	return errConflict{err}
}
// errUnauthorized wraps an error so it exposes the Unauthorized marker method.
type errUnauthorized struct{ error }

// Unauthorized is a no-op marker classifying the error as unauthorized.
func (errUnauthorized) Unauthorized() {}

// Cause returns the wrapped error (github.com/pkg/errors compatibility).
func (e errUnauthorized) Cause() error {
	return e.error
}

// Unwrap returns the wrapped error for use with errors.Is / errors.As.
func (e errUnauthorized) Unwrap() error {
	return e.error
}

// Unauthorized creates an [ErrUnauthorized] error from the given error.
// It returns the error as-is if it is either nil (no error) or already
// implements [ErrUnauthorized].
func Unauthorized(err error) error {
	if err == nil || cerrdefs.IsUnauthorized(err) {
		return err
	}
	return errUnauthorized{err}
}
// errUnavailable wraps an error so it exposes the Unavailable marker method.
type errUnavailable struct{ error }

// Unavailable is a no-op marker classifying the error as unavailable.
func (errUnavailable) Unavailable() {}

// Cause returns the wrapped error (github.com/pkg/errors compatibility).
func (e errUnavailable) Cause() error {
	return e.error
}

// Unwrap returns the wrapped error for use with errors.Is / errors.As.
func (e errUnavailable) Unwrap() error {
	return e.error
}

// Unavailable creates an [ErrUnavailable] error from the given error.
// It returns the error as-is if it is either nil (no error) or already
// implements [ErrUnavailable].
func Unavailable(err error) error {
	if err == nil || cerrdefs.IsUnavailable(err) {
		return err
	}
	return errUnavailable{err}
}
// errForbidden wraps an error so it exposes the Forbidden marker method.
type errForbidden struct{ error }

// Forbidden is a no-op marker classifying the error as forbidden.
func (errForbidden) Forbidden() {}

// Cause returns the wrapped error (github.com/pkg/errors compatibility).
func (e errForbidden) Cause() error {
	return e.error
}

// Unwrap returns the wrapped error for use with errors.Is / errors.As.
func (e errForbidden) Unwrap() error {
	return e.error
}

// Forbidden creates an [ErrForbidden] error from the given error.
// It returns the error as-is if it is either nil (no error) or already
// implements [ErrForbidden].
func Forbidden(err error) error {
	if err == nil || cerrdefs.IsPermissionDenied(err) {
		return err
	}
	return errForbidden{err}
}
// errSystem wraps an error so it exposes the System marker method.
type errSystem struct{ error }

// System is a no-op marker classifying the error as a system error.
func (errSystem) System() {}

// Cause returns the wrapped error (github.com/pkg/errors compatibility).
func (e errSystem) Cause() error {
	return e.error
}

// Unwrap returns the wrapped error for use with errors.Is / errors.As.
func (e errSystem) Unwrap() error {
	return e.error
}

// System creates an [ErrSystem] error from the given error.
// It returns the error as-is if it is either nil (no error) or already
// implements [ErrSystem].
func System(err error) error {
	if err == nil || cerrdefs.IsInternal(err) {
		return err
	}
	return errSystem{err}
}
// errNotModified wraps an error so it exposes the NotModified marker method.
type errNotModified struct{ error }

// NotModified is a no-op marker classifying the error as "not modified".
func (errNotModified) NotModified() {}

// Cause returns the wrapped error (github.com/pkg/errors compatibility).
func (e errNotModified) Cause() error {
	return e.error
}

// Unwrap returns the wrapped error for use with errors.Is / errors.As.
func (e errNotModified) Unwrap() error {
	return e.error
}

// NotModified creates an [ErrNotModified] error from the given error.
// It returns the error as-is if it is either nil (no error) or already
// implements [ErrNotModified].
func NotModified(err error) error {
	if err == nil || cerrdefs.IsNotModified(err) {
		return err
	}
	return errNotModified{err}
}
// errNotImplemented wraps an error so it exposes the NotImplemented marker
// method.
type errNotImplemented struct{ error }

// NotImplemented is a no-op marker classifying the error as "not implemented".
func (errNotImplemented) NotImplemented() {}

// Cause returns the wrapped error (github.com/pkg/errors compatibility).
func (e errNotImplemented) Cause() error {
	return e.error
}

// Unwrap returns the wrapped error for use with errors.Is / errors.As.
func (e errNotImplemented) Unwrap() error {
	return e.error
}

// NotImplemented creates an [ErrNotImplemented] error from the given error.
// It returns the error as-is if it is either nil (no error) or already
// implements [ErrNotImplemented].
func NotImplemented(err error) error {
	if err == nil || cerrdefs.IsNotImplemented(err) {
		return err
	}
	return errNotImplemented{err}
}
// errUnknown wraps an error so it exposes the Unknown marker method.
type errUnknown struct{ error }

// Unknown is a no-op marker classifying the error as unknown.
func (errUnknown) Unknown() {}

// Cause returns the wrapped error (github.com/pkg/errors compatibility).
func (e errUnknown) Cause() error {
	return e.error
}

// Unwrap returns the wrapped error for use with errors.Is / errors.As.
func (e errUnknown) Unwrap() error {
	return e.error
}

// Unknown creates an [ErrUnknown] error from the given error.
// It returns the error as-is if it is either nil (no error) or already
// implements [ErrUnknown].
func Unknown(err error) error {
	if err == nil || cerrdefs.IsUnknown(err) {
		return err
	}
	return errUnknown{err}
}
// errCancelled wraps an error so it exposes the Cancelled marker method.
type errCancelled struct{ error }

// Cancelled is a no-op marker classifying the error as cancelled.
func (errCancelled) Cancelled() {}

// Cause returns the wrapped error (github.com/pkg/errors compatibility).
func (e errCancelled) Cause() error {
	return e.error
}

// Unwrap returns the wrapped error for use with errors.Is / errors.As.
func (e errCancelled) Unwrap() error {
	return e.error
}

// Cancelled creates an [ErrCancelled] error from the given error.
// It returns the error as-is if it is either nil (no error) or already
// implements [ErrCancelled].
func Cancelled(err error) error {
	if err == nil || cerrdefs.IsCanceled(err) {
		return err
	}
	return errCancelled{err}
}
// errDeadline wraps an error so it exposes the DeadlineExceeded marker method.
type errDeadline struct{ error }

// DeadlineExceeded is a no-op marker classifying the error as a deadline
// being exceeded.
func (errDeadline) DeadlineExceeded() {}

// Cause returns the wrapped error (github.com/pkg/errors compatibility).
func (e errDeadline) Cause() error {
	return e.error
}

// Unwrap returns the wrapped error for use with errors.Is / errors.As.
func (e errDeadline) Unwrap() error {
	return e.error
}

// Deadline creates an [ErrDeadline] error from the given error.
// It returns the error as-is if it is either nil (no error) or already
// implements [ErrDeadline].
func Deadline(err error) error {
	if err == nil || cerrdefs.IsDeadlineExceeded(err) {
		return err
	}
	return errDeadline{err}
}
// errDataLoss wraps an error so it exposes the DataLoss marker method.
type errDataLoss struct{ error }

// DataLoss is a no-op marker classifying the error as data loss.
func (errDataLoss) DataLoss() {}

// Cause returns the wrapped error (github.com/pkg/errors compatibility).
func (e errDataLoss) Cause() error {
	return e.error
}

// Unwrap returns the wrapped error for use with errors.Is / errors.As.
func (e errDataLoss) Unwrap() error {
	return e.error
}

// DataLoss creates an [ErrDataLoss] error from the given error.
// It returns the error as-is if it is either nil (no error) or already
// implements [ErrDataLoss].
func DataLoss(err error) error {
	if err == nil || cerrdefs.IsDataLoss(err) {
		return err
	}
	return errDataLoss{err}
}
// FromContext returns the error class from the passed in context.
func FromContext(ctx context.Context) error {
	switch e := ctx.Err(); e {
	case nil:
		return nil
	case context.Canceled:
		return Cancelled(e)
	case context.DeadlineExceeded:
		return Deadline(e)
	default:
		return Unknown(e)
	}
}
package errdefs
import (
"net/http"
)
// FromStatusCode creates an errdef error, based on the provided HTTP status-code
//
// Deprecated: Use [cerrdefs.ToNative] instead
func FromStatusCode(err error, statusCode int) error {
	if err == nil {
		return nil
	}
	switch statusCode {
	case http.StatusNotFound:
		return NotFound(err)
	case http.StatusBadRequest:
		return InvalidParameter(err)
	case http.StatusConflict:
		return Conflict(err)
	case http.StatusUnauthorized:
		return Unauthorized(err)
	case http.StatusServiceUnavailable:
		return Unavailable(err)
	case http.StatusForbidden:
		return Forbidden(err)
	case http.StatusNotModified:
		return NotModified(err)
	case http.StatusNotImplemented:
		return NotImplemented(err)
	case http.StatusInternalServerError:
		// Keep errors that already carry an appropriate class.
		if IsCancelled(err) || IsSystem(err) || IsUnknown(err) || IsDataLoss(err) || IsDeadline(err) {
			return err
		}
		return System(err)
	}
	// Unrecognized status code: classify by numeric range.
	if statusCode >= http.StatusOK && statusCode < http.StatusBadRequest {
		// it's a client error
		return err
	}
	if statusCode >= http.StatusBadRequest && statusCode < http.StatusInternalServerError {
		return InvalidParameter(err)
	}
	if statusCode >= http.StatusInternalServerError && statusCode < 600 {
		return System(err)
	}
	return Unknown(err)
}
package errdefs
import (
"context"
"errors"
cerrdefs "github.com/containerd/errdefs"
)
// IsNotFound returns if the passed in error is an [ErrNotFound].
//
// Deprecated: use containerd [cerrdefs.IsNotFound]
var IsNotFound = cerrdefs.IsNotFound

// IsInvalidParameter returns if the passed in error is an [ErrInvalidParameter].
//
// Deprecated: use containerd [cerrdefs.IsInvalidArgument]
var IsInvalidParameter = cerrdefs.IsInvalidArgument

// IsConflict returns if the passed in error is an [ErrConflict].
//
// Deprecated: use containerd [cerrdefs.IsConflict]
var IsConflict = cerrdefs.IsConflict

// IsUnauthorized returns if the passed in error is an [ErrUnauthorized].
//
// Deprecated: use containerd [cerrdefs.IsUnauthorized]
var IsUnauthorized = cerrdefs.IsUnauthorized

// IsUnavailable returns if the passed in error is an [ErrUnavailable].
//
// Deprecated: use containerd [cerrdefs.IsUnavailable]
var IsUnavailable = cerrdefs.IsUnavailable

// IsForbidden returns if the passed in error is an [ErrForbidden].
//
// Deprecated: use containerd [cerrdefs.IsPermissionDenied]
var IsForbidden = cerrdefs.IsPermissionDenied

// IsSystem returns if the passed in error is an [ErrSystem].
//
// Deprecated: use containerd [cerrdefs.IsInternal]
var IsSystem = cerrdefs.IsInternal

// IsNotModified returns if the passed in error is an [ErrNotModified].
//
// Deprecated: use containerd [cerrdefs.IsNotModified]
var IsNotModified = cerrdefs.IsNotModified

// IsNotImplemented returns if the passed in error is an [ErrNotImplemented].
//
// Deprecated: use containerd [cerrdefs.IsNotImplemented]
var IsNotImplemented = cerrdefs.IsNotImplemented

// IsUnknown returns if the passed in error is an [ErrUnknown].
//
// Deprecated: use containerd [cerrdefs.IsUnknown]
var IsUnknown = cerrdefs.IsUnknown

// IsCancelled returns if the passed in error is an [ErrCancelled].
//
// Deprecated: use containerd [cerrdefs.IsCanceled]
var IsCancelled = cerrdefs.IsCanceled

// IsDeadline returns if the passed in error is an [ErrDeadline].
//
// Deprecated: use containerd [cerrdefs.IsDeadlineExceeded]
var IsDeadline = cerrdefs.IsDeadlineExceeded

// IsDataLoss returns if the passed in error is an [ErrDataLoss].
//
// Deprecated: use containerd [cerrdefs.IsDataLoss]
var IsDataLoss = cerrdefs.IsDataLoss
// IsContext returns if the passed in error is due to context cancellation or deadline exceeded.
func IsContext(err error) bool {
return errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded)
}
package cache
import (
"context"
"fmt"
"reflect"
"strings"
"github.com/containerd/log"
"github.com/docker/docker/daemon/builder"
"github.com/docker/docker/dockerversion"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
containertypes "github.com/moby/moby/api/types/container"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// ImageCacheStore is the interface the build cache needs from an image
// store: resolving, relating (parent/child), and creating images.
type ImageCacheStore interface {
	// Get returns the image for the given ID.
	Get(image.ID) (*image.Image, error)
	// GetByRef resolves a name/ID reference to an image.
	GetByRef(ctx context.Context, refOrId string) (*image.Image, error)
	// SetParent records parent as the parent image of target.
	SetParent(target, parent image.ID) error
	// GetParent returns the recorded parent of target.
	GetParent(target image.ID) (image.ID, error)
	// Create stores a new image derived from parent with an optional extra layer.
	Create(parent *image.Image, image image.Image, extraLayer layer.DiffID) (image.ID, error)
	// IsBuiltLocally reports whether the image may be used as build cache.
	IsBuiltLocally(id image.ID) (bool, error)
	// Children returns the IDs of the direct children of id.
	Children(id image.ID) []image.ID
}
// New returns an image cache backed by store. With an empty cacheFrom the
// cache resolves purely through the local parent chain; otherwise the named
// references are looked up and added as extra cache sources. Unresolvable
// references are skipped with a warning, except context cancellation or
// deadline errors, which abort construction.
func New(ctx context.Context, store ImageCacheStore, cacheFrom []string) (builder.ImageCache, error) {
	chainCache := &LocalImageCache{store: store}
	if len(cacheFrom) == 0 {
		return chainCache, nil
	}
	historyCache := &ImageCache{
		store:           store,
		localImageCache: chainCache,
	}
	for _, ref := range cacheFrom {
		img, err := store.GetByRef(ctx, ref)
		if err == nil {
			historyCache.Populate(img)
			continue
		}
		if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
			return nil, err
		}
		log.G(ctx).Warnf("Could not look up %s for cache resolution, skipping: %+v", ref, err)
	}
	return historyCache, nil
}
// LocalImageCache is cache based on parent chain.
type LocalImageCache struct {
	store ImageCacheStore
}
// GetCache returns the image id found in the cache, or an empty string when
// no cached child of imgID matches config and platform.
func (lic *LocalImageCache) GetCache(imgID string, config *containertypes.Config, platform ocispec.Platform) (string, error) {
	return getImageIDAndError(getLocalCachedImage(lic.store, image.ID(imgID), config, platform))
}
// ImageCache is cache based on history objects. Requires initial set of images.
type ImageCache struct {
	// sources are the images whose history is matched against build steps.
	sources []*image.Image
	store   ImageCacheStore
	// localImageCache is consulted first, before the history-based lookup.
	localImageCache *LocalImageCache
}
// Populate adds an image to the cache (to be queried later).
//
// The parameter was renamed from "image" to "img": the old name shadowed the
// imported image package inside the method body.
func (ic *ImageCache) Populate(img *image.Image) {
	ic.sources = append(ic.sources, img)
}
// GetCache returns the image id found in the cache. It first consults the
// local parent-chain cache; a local hit is only returned when it belongs to
// the parent chain of one of the cache sources. Otherwise it matches the
// next history entry of each source against cfg.
func (ic *ImageCache) GetCache(parentID string, cfg *containertypes.Config, platform ocispec.Platform) (string, error) {
	imgID, err := ic.localImageCache.GetCache(parentID, cfg, platform)
	if err != nil {
		return "", err
	}
	if imgID != "" {
		// Only trust the local hit if it is an ancestor of a cache source.
		for _, s := range ic.sources {
			if ic.isParent(s.ID(), image.ID(imgID)) {
				return imgID, nil
			}
		}
	}
	var parent *image.Image
	lenHistory := 0
	if parentID != "" {
		parent, err = ic.store.Get(image.ID(parentID))
		if err != nil {
			return "", errors.Wrapf(err, "unable to find image %v", parentID)
		}
		lenHistory = len(parent.History)
	}
	for _, target := range ic.sources {
		// isValidParent guarantees len(target.History) > lenHistory, so the
		// index below is in range.
		if !isValidParent(target, parent) || !isValidConfig(cfg, target.History[lenHistory]) {
			continue
		}
		if len(target.History)-1 == lenHistory { // last
			if parent != nil {
				if err := ic.store.SetParent(target.ID(), parent.ID()); err != nil {
					return "", errors.Wrapf(err, "failed to set parent for %v to %v", target.ID(), parent.ID())
				}
			}
			return target.ID().String(), nil
		}
		imgID, err := ic.restoreCachedImage(parent, target, cfg)
		if err != nil {
			return "", errors.Wrapf(err, "failed to restore cached image from %q to %v", parentID, target.ID())
		}
		ic.sources = []*image.Image{target} // avoid jumping to different target, tuned for safety atm
		return imgID.String(), nil
	}
	return "", nil
}
// restoreCachedImage materializes an intermediate image from target's
// history so it can serve as a cache hit: it takes parent's rootfs/history,
// appends the next history entry (and layer, when non-empty) from target,
// and stores the result as a child of parent.
func (ic *ImageCache) restoreCachedImage(parent, target *image.Image, cfg *containertypes.Config) (image.ID, error) {
	var history []image.History
	rootFS := image.NewRootFS()
	lenHistory := 0
	if parent != nil {
		// NOTE(review): history/rootFS alias parent's slices here; the append
		// below may share parent.History's backing array — presumably safe
		// because parents are freshly unmarshalled, but worth confirming.
		history = parent.History
		rootFS = parent.RootFS
		lenHistory = len(parent.History)
	}
	history = append(history, target.History[lenHistory])
	layer := getLayerForHistoryIndex(target, lenHistory)
	if layer != "" {
		rootFS.Append(layer)
	}
	restoredImg := image.Image{
		V1Image: image.V1Image{
			DockerVersion: dockerversion.Version,
			Config:        cfg,
			Architecture:  target.Architecture,
			OS:            target.OS,
			Author:        target.Author,
			Created:       history[len(history)-1].Created,
		},
		RootFS:     rootFS,
		History:    history,
		OSFeatures: target.OSFeatures,
		OSVersion:  target.OSVersion,
	}
	imgID, err := ic.store.Create(parent, restoredImg, layer)
	if err != nil {
		return "", errors.Wrap(err, "failed to create cache image")
	}
	return imgID, nil
}
// isParent reports whether parentID appears anywhere in the parent chain of
// imgID. A missing parent terminates the walk with false.
//
// The previous implementation recursed with no cycle protection, so a
// corrupted store with a cyclic parent chain would overflow the stack. This
// version walks iteratively and stops if it revisits an ID.
func (ic *ImageCache) isParent(imgID, parentID image.ID) bool {
	visited := map[image.ID]struct{}{}
	for {
		nextParent, err := ic.store.GetParent(imgID)
		if err != nil {
			return false
		}
		if nextParent == parentID {
			return true
		}
		if _, seen := visited[nextParent]; seen {
			// Cycle in the parent chain; bail out instead of looping forever.
			return false
		}
		visited[nextParent] = struct{}{}
		imgID = nextParent
	}
}
// getLayerForHistoryIndex returns the diff ID backing history entry `index`
// of img, or "" when that entry adds no layer. Entries past the history end
// map to the layer after all non-empty entries.
func getLayerForHistoryIndex(img *image.Image, index int) layer.DiffID {
	nonEmpty := 0
	for i, h := range img.History {
		if i == index {
			if h.EmptyLayer {
				return ""
			}
			break
		}
		if !h.EmptyLayer {
			nonEmpty++
		}
	}
	return img.RootFS.DiffIDs[nonEmpty] // validate?
}
// isValidConfig reports whether the build step's command matches the
// recorded history entry.
// todo: make this format better than join that loses data
func isValidConfig(cfg *containertypes.Config, h image.History) bool {
	joined := strings.Join(cfg.Cmd, " ")
	return joined == h.CreatedBy
}
// isValidParent reports whether parent is a strict prefix of img: parent's
// history and rootfs layers must each be shorter and match img's leading
// entries. A nil or completely empty parent is always valid.
func isValidParent(img, parent *image.Image) bool {
	if len(img.History) == 0 {
		return false
	}
	if parent == nil || (len(parent.History) == 0 && len(parent.RootFS.DiffIDs) == 0) {
		return true
	}
	if len(parent.History) >= len(img.History) || len(parent.RootFS.DiffIDs) > len(img.RootFS.DiffIDs) {
		return false
	}
	for i := range parent.History {
		if !reflect.DeepEqual(parent.History[i], img.History[i]) {
			return false
		}
	}
	for i := range parent.RootFS.DiffIDs {
		if parent.RootFS.DiffIDs[i] != img.RootFS.DiffIDs[i] {
			return false
		}
	}
	return true
}
// getImageIDAndError converts a (*image.Image, error) pair into the
// (string, error) shape GetCache returns; a nil image yields "".
func getImageIDAndError(img *image.Image, err error) (string, error) {
	if err != nil || img == nil {
		return "", err
	}
	return img.ID().String(), nil
}
// getLocalCachedImage returns the most recent created image that is a child
// of the image with imgID, that had the same config when it was
// created. nil is returned if a child cannot be found. An error is
// returned if the parent image cannot be found.
func getLocalCachedImage(imageStore ImageCacheStore, parentID image.ID, config *containertypes.Config, platform ocispec.Platform) (*image.Image, error) {
	if config == nil {
		return nil, nil
	}
	var match *image.Image
	for _, id := range imageStore.Children(parentID) {
		img, err := imageStore.Get(id)
		if err != nil {
			return nil, fmt.Errorf("unable to find image %q", id)
		}
		// Only locally-built images are eligible as cache; failures here are
		// logged and the candidate is skipped rather than aborting the scan.
		builtLocally, err := imageStore.IsBuiltLocally(id)
		if err != nil {
			log.G(context.TODO()).WithFields(log.Fields{
				"error": err,
				"id":    id,
			}).Warn("failed to check if image was built locally")
			continue
		}
		if !builtLocally {
			continue
		}
		imgPlatform := img.Platform()
		// Discard old linux/amd64 images with empty platform.
		if imgPlatform.OS == "" && imgPlatform.Architecture == "" {
			continue
		}
		if !comparePlatform(platform, imgPlatform) {
			continue
		}
		if compare(&img.ContainerConfig, config) {
			// check for the most up to date match
			// (match.Created is non-nil here because match is only ever set
			// from an img with a non-nil Created).
			if img.Created != nil && (match == nil || match.Created.Before(*img.Created)) {
				match = img
			}
		}
	}
	return match, nil
}
package cache
import (
"strings"
"github.com/containerd/platforms"
"github.com/moby/moby/api/types/container"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// comparePlatform reports whether imagePlatform satisfies builderPlatform,
// using the containerd platform matcher.
func comparePlatform(builderPlatform, imagePlatform ocispec.Platform) bool {
	// On Windows, only check the Major and Minor versions.
	// The Build and Revision compatibility depends on whether `process` or
	// `hyperv` isolation used.
	//
	// Fixes https://github.com/moby/moby/issues/47307
	if builderPlatform.OS == "windows" && imagePlatform.OS == builderPlatform.OS {
		// OSVersion format is:
		// Major.Minor.Build.Revision
		builderParts := strings.Split(builderPlatform.OSVersion, ".")
		imageParts := strings.Split(imagePlatform.OSVersion, ".")
		if len(builderParts) >= 3 && len(imageParts) >= 3 {
			// Keep only Major & Minor.
			// Splice the image's Major.Minor onto the builder's remaining
			// parts, so the matcher below sees identical Build/Revision and
			// effectively compares Major.Minor only.
			builderParts[0] = imageParts[0]
			builderParts[1] = imageParts[1]
			imagePlatform.OSVersion = strings.Join(builderParts, ".")
		}
	}
	return platforms.Only(builderPlatform).Match(imagePlatform)
}
// compare reports whether two container configs are equivalent for
// build-cache matching. It does not compare container-specific fields:
// - Image
// - Hostname
// - Domainname
// - MacAddress
func compare(a, b *container.Config) bool {
	if a == nil || b == nil {
		return false
	}
	// Cheap length checks first; element-wise comparisons follow.
	if len(a.Env) != len(b.Env) {
		return false
	}
	if len(a.Cmd) != len(b.Cmd) {
		return false
	}
	if len(a.Entrypoint) != len(b.Entrypoint) {
		return false
	}
	if len(a.Shell) != len(b.Shell) {
		return false
	}
	if len(a.ExposedPorts) != len(b.ExposedPorts) {
		return false
	}
	if len(a.Volumes) != len(b.Volumes) {
		return false
	}
	if len(a.Labels) != len(b.Labels) {
		return false
	}
	if len(a.OnBuild) != len(b.OnBuild) {
		return false
	}
	// Order-sensitive slice comparisons.
	for i := 0; i < len(a.Env); i++ {
		if a.Env[i] != b.Env[i] {
			return false
		}
	}
	for i := 0; i < len(a.OnBuild); i++ {
		if a.OnBuild[i] != b.OnBuild[i] {
			return false
		}
	}
	for i := 0; i < len(a.Cmd); i++ {
		if a.Cmd[i] != b.Cmd[i] {
			return false
		}
	}
	for i := 0; i < len(a.Entrypoint); i++ {
		if a.Entrypoint[i] != b.Entrypoint[i] {
			return false
		}
	}
	for i := 0; i < len(a.Shell); i++ {
		if a.Shell[i] != b.Shell[i] {
			return false
		}
	}
	// Set/map comparisons (lengths already known equal, so key-presence /
	// value checks in one direction suffice).
	for k := range a.ExposedPorts {
		if _, exists := b.ExposedPorts[k]; !exists {
			return false
		}
	}
	for key := range a.Volumes {
		if _, exists := b.Volumes[key]; !exists {
			return false
		}
	}
	for k, v := range a.Labels {
		if v != b.Labels[k] {
			return false
		}
	}
	// Scalar fields.
	if a.AttachStdin != b.AttachStdin {
		return false
	}
	if a.AttachStdout != b.AttachStdout {
		return false
	}
	if a.AttachStderr != b.AttachStderr {
		return false
	}
	if a.NetworkDisabled != b.NetworkDisabled {
		return false
	}
	if a.Tty != b.Tty {
		return false
	}
	if a.OpenStdin != b.OpenStdin {
		return false
	}
	if a.StdinOnce != b.StdinOnce {
		return false
	}
	if a.ArgsEscaped != b.ArgsEscaped {
		return false
	}
	if a.User != b.User {
		return false
	}
	if a.WorkingDir != b.WorkingDir {
		return false
	}
	if a.StopSignal != b.StopSignal {
		return false
	}
	// Optional (pointer) fields: nil-ness must agree, then values.
	if (a.StopTimeout == nil) != (b.StopTimeout == nil) {
		return false
	}
	if a.StopTimeout != nil && b.StopTimeout != nil {
		if *a.StopTimeout != *b.StopTimeout {
			return false
		}
	}
	if (a.Healthcheck == nil) != (b.Healthcheck == nil) {
		return false
	}
	if a.Healthcheck != nil && b.Healthcheck != nil {
		if a.Healthcheck.Interval != b.Healthcheck.Interval {
			return false
		}
		if a.Healthcheck.StartInterval != b.Healthcheck.StartInterval {
			return false
		}
		if a.Healthcheck.StartPeriod != b.Healthcheck.StartPeriod {
			return false
		}
		if a.Healthcheck.Timeout != b.Healthcheck.Timeout {
			return false
		}
		if a.Healthcheck.Retries != b.Healthcheck.Retries {
			return false
		}
		if len(a.Healthcheck.Test) != len(b.Healthcheck.Test) {
			return false
		}
		for i := 0; i < len(a.Healthcheck.Test); i++ {
			if a.Healthcheck.Test[i] != b.Healthcheck.Test[i] {
				return false
			}
		}
	}
	return true
}
package image
import (
"context"
"fmt"
"os"
"path/filepath"
"sync"
"github.com/containerd/log"
"github.com/moby/sys/atomicwriter"
"github.com/opencontainers/go-digest"
"github.com/pkg/errors"
)
// DigestWalkFunc is function called by StoreBackend.Walk
type DigestWalkFunc func(id digest.Digest) error
// StoreBackend provides interface for image.Store persistence
type StoreBackend interface {
	// Walk calls f for every stored digest.
	Walk(f DigestWalkFunc) error
	// Get returns the content stored under id.
	Get(id digest.Digest) ([]byte, error)
	// Set stores data and returns its digest.
	Set(data []byte) (digest.Digest, error)
	// Delete removes the content and all metadata for id.
	Delete(id digest.Digest) error
	// SetMetadata stores a keyed metadata blob for an existing id.
	SetMetadata(id digest.Digest, key string, data []byte) error
	// GetMetadata returns the metadata blob stored under key for id.
	GetMetadata(id digest.Digest, key string) ([]byte, error)
	// DeleteMetadata removes the metadata blob stored under key for id.
	DeleteMetadata(id digest.Digest, key string) error
}
// fs implements StoreBackend using the filesystem.
type fs struct {
	sync.RWMutex // guards all reads/writes under root
	root         string
}
const (
	// contentDirName holds content files, addressed by digest.
	contentDirName = "content"
	// metadataDirName holds per-digest metadata key files.
	metadataDirName = "metadata"
)
// NewFSStoreBackend returns new filesystem based backend for image.Store.
func NewFSStoreBackend(root string) (StoreBackend, error) {
	s, err := newFSStore(root)
	if err != nil {
		// Return a literal nil interface. The previous one-line
		// `return newFSStore(root)` wrapped the typed nil (*fs) in the
		// StoreBackend interface on failure, producing a value that does
		// not compare equal to nil (the classic typed-nil-interface trap).
		return nil, err
	}
	return s, nil
}
// newFSStore creates the backend rooted at root, pre-creating the content
// and metadata directories for the canonical (sha256) digest algorithm.
func newFSStore(root string) (*fs, error) {
	s := &fs{
		root: root,
	}
	if err := os.MkdirAll(filepath.Join(root, contentDirName, string(digest.Canonical)), 0o700); err != nil {
		return nil, errors.Wrap(err, "failed to create storage backend")
	}
	if err := os.MkdirAll(filepath.Join(root, metadataDirName, string(digest.Canonical)), 0o700); err != nil {
		return nil, errors.Wrap(err, "failed to create storage backend")
	}
	return s, nil
}
// contentFile returns the on-disk path of the content addressed by dgst.
func (s *fs) contentFile(dgst digest.Digest) string {
	algo := string(dgst.Algorithm())
	return filepath.Join(s.root, contentDirName, algo, dgst.Encoded())
}
// metadataDir returns the on-disk directory holding dgst's metadata files.
func (s *fs) metadataDir(dgst digest.Digest) string {
	algo := string(dgst.Algorithm())
	return filepath.Join(s.root, metadataDirName, algo, dgst.Encoded())
}
// Walk calls the supplied callback for each image ID in the storage backend.
// The read lock is held only while listing the directory; f runs unlocked,
// so it may safely call back into the store.
func (s *fs) Walk(f DigestWalkFunc) error {
	// Only Canonical digest (sha256) is currently supported
	s.RLock()
	dir, err := os.ReadDir(filepath.Join(s.root, contentDirName, string(digest.Canonical)))
	s.RUnlock()
	if err != nil {
		return err
	}
	for _, v := range dir {
		dgst := digest.NewDigestFromEncoded(digest.Canonical, v.Name())
		// Stray files that are not valid digests are skipped, not fatal.
		if err := dgst.Validate(); err != nil {
			log.G(context.TODO()).Debugf("skipping invalid digest %s: %s", dgst, err)
			continue
		}
		if err := f(dgst); err != nil {
			return err
		}
	}
	return nil
}
// Get returns the content stored under a given digest.
func (s *fs) Get(dgst digest.Digest) ([]byte, error) {
	s.RLock()
	defer s.RUnlock()
	return s.get(dgst)
}
// get reads and verifies the content for dgst. Callers must hold s's lock.
// The content is re-hashed on every read to detect on-disk corruption.
func (s *fs) get(dgst digest.Digest) ([]byte, error) {
	content, err := os.ReadFile(s.contentFile(dgst))
	if err != nil {
		return nil, errors.Wrapf(err, "failed to get digest %s", dgst)
	}
	// todo: maybe optional
	if digest.FromBytes(content) != dgst {
		return nil, fmt.Errorf("failed to verify: %v", dgst)
	}
	return content, nil
}
// Set stores content by checksum. The write is atomic (write-then-rename),
// so readers never observe a partially written content file.
func (s *fs) Set(data []byte) (digest.Digest, error) {
	s.Lock()
	defer s.Unlock()
	if len(data) == 0 {
		return "", errors.New("invalid empty data")
	}
	dgst := digest.FromBytes(data)
	if err := atomicwriter.WriteFile(s.contentFile(dgst), data, 0o600); err != nil {
		return "", errors.Wrap(err, "failed to write digest data")
	}
	return dgst, nil
}
// Delete removes content and metadata files associated with the digest.
// Metadata is removed first so that a partial failure never leaves metadata
// behind for content that no longer exists.
func (s *fs) Delete(dgst digest.Digest) error {
	s.Lock()
	defer s.Unlock()
	if err := os.RemoveAll(s.metadataDir(dgst)); err != nil {
		return err
	}
	return os.Remove(s.contentFile(dgst))
}
// SetMetadata sets metadata for a given ID. It fails if there's no base file.
func (s *fs) SetMetadata(dgst digest.Digest, key string, data []byte) error {
	s.Lock()
	defer s.Unlock()
	// Verify the content exists (and is intact) before attaching metadata.
	if _, err := s.get(dgst); err != nil {
		return err
	}
	baseDir := s.metadataDir(dgst)
	if err := os.MkdirAll(baseDir, 0o700); err != nil {
		return err
	}
	return atomicwriter.WriteFile(filepath.Join(baseDir, key), data, 0o600)
}
// GetMetadata returns metadata for a given digest. It fails when the base
// content is missing or corrupt, or when the metadata key does not exist.
func (s *fs) GetMetadata(dgst digest.Digest, key string) ([]byte, error) {
	s.RLock()
	defer s.RUnlock()
	if _, err := s.get(dgst); err != nil {
		return nil, err
	}
	content, err := os.ReadFile(filepath.Join(s.metadataDir(dgst), key))
	if err != nil {
		return nil, errors.Wrap(err, "failed to read metadata")
	}
	return content, nil
}
// DeleteMetadata removes the metadata associated with a digest. Removing a
// key that does not exist is not an error.
func (s *fs) DeleteMetadata(dgst digest.Digest, key string) error {
	s.Lock()
	defer s.Unlock()
	target := filepath.Join(s.metadataDir(dgst), key)
	return os.RemoveAll(target)
}
package image
import (
"context"
"encoding/json"
"errors"
"io"
"runtime"
"strings"
"time"
"github.com/docker/docker/dockerversion"
"github.com/docker/docker/layer"
"github.com/moby/moby/api/types/container"
"github.com/opencontainers/go-digest"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
// ID is the content-addressable ID of an image.
type ID digest.Digest
// String returns the digest's string form.
func (id ID) String() string {
	return id.Digest().String()
}
// Digest converts ID into a digest
func (id ID) Digest() digest.Digest {
	return digest.Digest(id)
}
// V1Image stores the V1 image configuration.
type V1Image struct {
	// ID is a unique 64 character identifier of the image
	ID string `json:"id,omitempty"`
	// Parent is the ID of the parent image.
	//
	// Depending on how the image was created, this field may be empty and
	// is only set for images that were built/created locally. This field
	// is empty if the image was pulled from an image registry.
	Parent string `json:"parent,omitempty"`
	// Comment is an optional message that can be set when committing or
	// importing the image.
	Comment string `json:"comment,omitempty"`
	// Created is the timestamp at which the image was created
	Created *time.Time `json:"created"`
	// Container is the ID of the container that was used to create the image.
	//
	// Depending on how the image was created, this field may be empty.
	Container string `json:"container,omitempty"`
	// ContainerConfig is the configuration of the container that was committed
	// into the image.
	ContainerConfig container.Config `json:"container_config,omitempty"`
	// DockerVersion is the version of Docker that was used to build the image.
	//
	// Depending on how the image was created, this field may be empty.
	DockerVersion string `json:"docker_version,omitempty"`
	// Author is the name of the author that was specified when committing the
	// image, or as specified through MAINTAINER (deprecated) in the Dockerfile.
	Author string `json:"author,omitempty"`
	// Config is the configuration of the container received from the client.
	Config *container.Config `json:"config,omitempty"`
	// Architecture is the hardware CPU architecture that the image runs on.
	Architecture string `json:"architecture,omitempty"`
	// Variant is the CPU architecture variant (presently ARM-only).
	Variant string `json:"variant,omitempty"`
	// OS is the Operating System the image is built to run on.
	OS string `json:"os,omitempty"`
	// Size is the total size of the image including all layers it is composed of.
	// The empty json tag keeps the default "Size" key while omitting zero values.
	Size int64 `json:",omitempty"`
}
// Image stores the image configuration
type Image struct {
	V1Image
	// Parent is the ID of the parent image.
	//
	// Depending on how the image was created, this field may be empty and
	// is only set for images that were built/created locally. This field
	// is empty if the image was pulled from an image registry.
	Parent ID `json:"parent,omitempty"` //nolint:govet
	// RootFS contains information about the image's RootFS, including the
	// layer IDs.
	RootFS  *RootFS   `json:"rootfs,omitempty"`
	History []History `json:"history,omitempty"`
	// OSVersion is the version of the Operating System the image is built to
	// run on (especially for Windows).
	OSVersion  string   `json:"os.version,omitempty"`
	OSFeatures []string `json:"os.features,omitempty"`
	// rawJSON caches the immutable JSON associated with this image.
	rawJSON []byte
	// computedID is the ID computed from the hash of the image config.
	// Not to be confused with the legacy V1 ID in V1Image.
	computedID ID
	// Details holds additional details about image
	Details *Details `json:"-"`
}
// Details provides additional image data
type Details struct {
	// ManifestDescriptor is the descriptor of the platform-specific manifest
	// chosen by the [GetImage] call that returned this image.
	// The exact descriptor depends on the [GetImageOpts.Platform] field
	// passed to [GetImage] and the content availability.
	// This is only set by the containerd image service.
	ManifestDescriptor *ocispec.Descriptor
}
// RawJSON returns the immutable JSON associated with the image.
func (img *Image) RawJSON() []byte {
	return img.rawJSON
}
// ID returns the image's content-addressable ID.
func (img *Image) ID() ID {
	return img.computedID
}
// ImageID stringifies ID.
func (img *Image) ImageID() string {
	return img.ID().String()
}
// RunConfig returns the image's container config.
func (img *Image) RunConfig() *container.Config {
	return img.Config
}
// BaseImgArch returns the image's architecture. If not populated, defaults
// to the host runtime arch.
func (img *Image) BaseImgArch() string {
	if img.Architecture != "" {
		return img.Architecture
	}
	return runtime.GOARCH
}
// BaseImgVariant returns the image's variant, whether populated or not.
// This avoids creating an inconsistency where the stored image variant
// is "greater than" (i.e. v8 vs v6) the actual image variant.
func (img *Image) BaseImgVariant() string {
	return img.Variant
}
// OperatingSystem returns the image's operating system. If not populated,
// defaults to the host runtime OS.
func (img *Image) OperatingSystem() string {
	if img.OS != "" {
		return img.OS
	}
	return runtime.GOOS
}
// Platform generates an OCI platform from the image. Note that it uses the
// raw Architecture/OS fields, not the host-defaulting accessors.
func (img *Image) Platform() ocispec.Platform {
	return ocispec.Platform{
		Architecture: img.Architecture,
		OS:           img.OS,
		OSVersion:    img.OSVersion,
		OSFeatures:   img.OSFeatures,
		Variant:      img.Variant,
	}
}
// MarshalJSON serializes the image to JSON. It sorts the top-level keys so
// that JSON that's been manipulated by a push/pull cycle with a legacy
// registry won't end up with a different key order.
func (img *Image) MarshalJSON() ([]byte, error) {
	// The MarshalImage alias drops this MarshalJSON method, avoiding
	// infinite recursion on the first-pass marshal.
	type MarshalImage Image
	pass1, err := json.Marshal(MarshalImage(*img))
	if err != nil {
		return nil, err
	}
	// Round-trip through a map: encoding/json emits map keys sorted.
	var c map[string]*json.RawMessage
	if err := json.Unmarshal(pass1, &c); err != nil {
		return nil, err
	}
	return json.Marshal(c)
}
// ChildConfig is the configuration to apply to an Image to create a new
// Child image. Other properties of the image are copied from the parent.
type ChildConfig struct {
	// ContainerID is the container the child is committed from.
	ContainerID string
	Author      string
	Comment     string
	// DiffID is the new layer's diff ID; may denote an empty layer.
	DiffID layer.DiffID
	// ContainerConfig is the config of the committed container.
	ContainerConfig *container.Config
	// Config is the child image's own container config.
	Config *container.Config
}
// NewImage creates a new image with the given ID
func NewImage(id ID) *Image {
	return &Image{
		computedID: id,
	}
}
// NewChildImage creates a new Image as a child of this image: the parent's
// rootfs plus (unless empty) the child layer, and the parent's history plus
// one new entry describing the child.
func NewChildImage(img *Image, child ChildConfig, os string) *Image {
	isEmptyLayer := layer.IsEmpty(child.DiffID)
	var rootFS *RootFS
	if img.RootFS != nil {
		rootFS = img.RootFS.Clone()
	} else {
		rootFS = NewRootFS()
	}
	if !isEmptyLayer {
		rootFS.Append(child.DiffID)
	}
	imgHistory := NewHistory(
		child.Author,
		child.Comment,
		strings.Join(child.ContainerConfig.Cmd, " "),
		isEmptyLayer)
	// Copy the parent's history into a fresh slice before appending.
	// The previous `append(img.History, imgHistory)` could write into spare
	// capacity of the parent's backing array, silently corrupting the
	// history of a sibling image created from the same parent.
	history := make([]History, 0, len(img.History)+1)
	history = append(history, img.History...)
	history = append(history, imgHistory)
	return &Image{
		V1Image: V1Image{
			DockerVersion:   dockerversion.Version,
			Config:          child.Config,
			Architecture:    img.BaseImgArch(),
			Variant:         img.BaseImgVariant(),
			OS:              os,
			Container:       child.ContainerID,
			ContainerConfig: *child.ContainerConfig,
			Author:          child.Author,
			Created:         imgHistory.Created,
		},
		RootFS:     rootFS,
		History:    history,
		OSFeatures: img.OSFeatures,
		OSVersion:  img.OSVersion,
	}
}
// Clone clones an image and changes ID. The RootFS is deep-copied; History
// and other slice fields are shared with base.
// NOTE(review): rawJSON is carried over from base while the ID changes, so
// RawJSON() no longer hashes to the new ID — presumably callers re-serialize;
// confirm before relying on it.
func Clone(base *Image, id ID) *Image {
	img := *base
	img.RootFS = img.RootFS.Clone()
	img.V1Image.ID = id.String()
	img.computedID = id
	return &img
}
// History stores build commands that were used to create an image.
// It is an alias of the OCI image-spec history type.
type History = ocispec.History
// NewHistory creates a new history struct from arguments, and sets the
// created time to the current time in UTC.
func NewHistory(author, comment, createdBy string, isEmptyLayer bool) History {
	created := time.Now().UTC()
	h := History{
		Created:    &created,
		Author:     author,
		CreatedBy:  createdBy,
		Comment:    comment,
		EmptyLayer: isEmptyLayer,
	}
	return h
}
// Exporter provides interface for loading and saving images
type Exporter interface {
	// Load imports images from the tar stream, writing progress output.
	Load(context.Context, io.ReadCloser, io.Writer, bool) error
	// TODO: Load(net.Context, io.ReadCloser, <- chan StatusMessage) error
	// Save exports the named images as a tar stream.
	Save(context.Context, []string, io.Writer) error
}
// NewFromJSON creates an Image configuration from json. The source bytes
// are retained as the image's immutable raw JSON.
func NewFromJSON(src []byte) (*Image, error) {
	var img Image
	if err := json.Unmarshal(src, &img); err != nil {
		return nil, err
	}
	if img.RootFS == nil {
		return nil, errors.New("invalid image JSON, no RootFS key")
	}
	img.rawJSON = src
	return &img, nil
}
package image
import (
"errors"
"runtime"
"strings"
"github.com/docker/docker/errdefs"
)
// CheckOS checks if the given OS matches the host's platform
// (case-insensitively), and returns an invalid-parameter error otherwise.
func CheckOS(os string) error {
	if strings.EqualFold(os, runtime.GOOS) {
		return nil
	}
	return errdefs.InvalidParameter(errors.New("operating system is not supported"))
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package image
import (
"slices"
"github.com/docker/docker/layer"
"github.com/opencontainers/image-spec/identity"
)
// TypeLayers is used for RootFS.Type for filesystems organized into layers.
const TypeLayers = "layers"
// RootFS describes images root filesystem
// This is currently a placeholder that only supports layers. In the future
// this can be made into an interface that supports different implementations.
type RootFS struct {
	Type string `json:"type"`
	// DiffIDs are the layer digests, ordered bottom-most first.
	DiffIDs []layer.DiffID `json:"diff_ids,omitempty"`
}
// NewRootFS returns empty RootFS struct of type "layers".
func NewRootFS() *RootFS {
	return &RootFS{Type: TypeLayers}
}
// Append appends a new diffID to rootfs
func (r *RootFS) Append(id layer.DiffID) {
	r.DiffIDs = append(r.DiffIDs, id)
}
// Clone returns a deep copy of the RootFS; the DiffIDs slice is duplicated
// so mutating the copy cannot affect the original.
func (r *RootFS) Clone() *RootFS {
	c := NewRootFS()
	c.Type = r.Type
	c.DiffIDs = slices.Clone(r.DiffIDs)
	return c
}
// ChainID returns the ChainID for the top layer in RootFS, computed over
// the full DiffIDs chain.
func (r *RootFS) ChainID() layer.ChainID {
	return identity.ChainID(r.DiffIDs)
}
package image
import (
"context"
"fmt"
"os"
"sync"
"time"
"github.com/containerd/log"
"github.com/docker/docker/errdefs"
"github.com/docker/docker/layer"
"github.com/opencontainers/go-digest"
"github.com/opencontainers/go-digest/digestset"
"github.com/pkg/errors"
)
// Store is an interface for creating and accessing images
type Store interface {
	// Create stores an image from its JSON config and returns its ID.
	Create(config []byte) (ID, error)
	// Get returns the image for id.
	Get(id ID) (*Image, error)
	// Delete removes the image and releases its layer.
	Delete(id ID) ([]layer.Metadata, error)
	// Search resolves a partial image ID to a full ID.
	Search(partialID string) (ID, error)
	// SetParent/GetParent manage the recorded parent relationship.
	SetParent(id ID, parent ID) error
	GetParent(id ID) (ID, error)
	// SetLastUpdated/GetLastUpdated manage the last-updated timestamp.
	SetLastUpdated(id ID) error
	GetLastUpdated(id ID) (time.Time, error)
	// SetBuiltLocally/IsBuiltLocally manage build-cache eligibility.
	SetBuiltLocally(id ID) error
	IsBuiltLocally(id ID) (bool, error)
	// Children returns the direct children of id.
	Children(id ID) []ID
	// Map returns all images; Heads returns only images with no children.
	Map() map[ID]*Image
	Heads() map[ID]*Image
	// Len returns the number of images in the store.
	Len() int
}
// LayerGetReleaser is a minimal interface for getting and releasing layers.
type LayerGetReleaser interface {
	Get(layer.ChainID) (layer.Layer, error)
	Release(layer.Layer) ([]layer.Metadata, error)
}
// imageMeta is the in-memory bookkeeping kept per image: the retained layer
// (nil for images without layers) and the set of direct children.
type imageMeta struct {
	layer    layer.Layer
	children map[ID]struct{}
}
// store is the default Store implementation: in-memory indices over a
// persistent StoreBackend, with layers retained via a LayerGetReleaser.
type store struct {
	sync.RWMutex // guards images and the children maps within
	lss          LayerGetReleaser
	images       map[ID]*imageMeta
	fs           StoreBackend
	digestSet    *digestset.Set
}
// NewImageStore returns new store object for given set of layer stores
func NewImageStore(fs StoreBackend, lss LayerGetReleaser) (Store, error) {
	s := &store{
		lss:       lss,
		images:    make(map[ID]*imageMeta),
		fs:        fs,
		digestSet: digestset.NewSet(),
	}
	// Rebuild the in-memory state from the backend and retain layers before
	// handing the store out.
	if err := s.restore(); err != nil {
		return nil, err
	}
	return s, nil
}
// restore rebuilds the in-memory image index from the backend: a first pass
// loads every image and retains its layer, a second pass links children to
// parents. Invalid or unrestorable images are logged and skipped, not fatal.
func (is *store) restore() error {
	// As the code below is run when restoring all images (which can be "many"),
	// constructing the "log.G(ctx).WithFields" is deliberately not "DRY", as the
	// logger is only used for error-cases, and we don't want to do allocations
	// if we don't need it. The "f" type alias is here is just for convenience,
	// and to make the code _slightly_ more DRY. See the discussion on GitHub;
	// https://github.com/moby/moby/pull/44426#discussion_r1059519071
	type f = log.Fields
	err := is.fs.Walk(func(dgst digest.Digest) error {
		img, err := is.Get(ID(dgst))
		if err != nil {
			log.G(context.TODO()).WithFields(f{"digest": dgst, "err": err}).Error("invalid image")
			return nil
		}
		var l layer.Layer
		if chainID := img.RootFS.ChainID(); chainID != "" {
			if err := CheckOS(img.OperatingSystem()); err != nil {
				log.G(context.TODO()).WithFields(f{"chainID": chainID, "os": img.OperatingSystem()}).Error("not restoring image with unsupported operating system")
				return nil
			}
			l, err = is.lss.Get(chainID)
			if err != nil {
				if errors.Is(err, layer.ErrLayerDoesNotExist) {
					log.G(context.TODO()).WithFields(f{"chainID": chainID, "os": img.OperatingSystem(), "err": err}).Error("not restoring image")
					return nil
				}
				return err
			}
		}
		if err := is.digestSet.Add(dgst); err != nil {
			return err
		}
		is.images[ID(dgst)] = &imageMeta{
			layer:    l,
			children: make(map[ID]struct{}),
		}
		return nil
	})
	if err != nil {
		return err
	}
	// Second pass to fill in children maps
	for id := range is.images {
		if parent, err := is.GetParent(id); err == nil {
			if parentMeta := is.images[parent]; parentMeta != nil {
				parentMeta.children[id] = struct{}{}
			}
		}
	}
	return nil
}
// Create adds a new image to the store from its JSON config, retaining the
// referenced layer, and returns the content-addressed image ID. Creating an
// image that already exists is a no-op returning the existing ID.
//
// Cleanups vs. the previous version: the redundant `var img *Image`
// declaration before the `:=` assignment is gone, and the rootFSLayers map
// that was built but never read has been removed.
func (is *store) Create(config []byte) (ID, error) {
	img, err := NewFromJSON(config)
	if err != nil {
		return "", err
	}
	// Must reject any config that references more non-empty history entries
	// than there are rootfs layers.
	layerCounter := 0
	for _, h := range img.History {
		if !h.EmptyLayer {
			layerCounter++
		}
	}
	if layerCounter > len(img.RootFS.DiffIDs) {
		return "", errdefs.InvalidParameter(errors.New("too many non-empty layers in History section"))
	}
	imageDigest, err := is.fs.Set(config)
	if err != nil {
		return "", errdefs.InvalidParameter(err)
	}
	is.Lock()
	defer is.Unlock()
	imageID := ID(imageDigest)
	if _, exists := is.images[imageID]; exists {
		return imageID, nil
	}
	layerID := img.RootFS.ChainID()
	var l layer.Layer
	if layerID != "" {
		if err := CheckOS(img.OperatingSystem()); err != nil {
			return "", err
		}
		l, err = is.lss.Get(layerID)
		if err != nil {
			return "", errdefs.InvalidParameter(errors.Wrapf(err, "failed to get layer %s", layerID))
		}
	}
	is.images[imageID] = &imageMeta{
		layer:    l,
		children: make(map[ID]struct{}),
	}
	// Roll back the in-memory entry if the digest index rejects the ID.
	if err = is.digestSet.Add(imageDigest); err != nil {
		delete(is.images, imageID)
		return "", errdefs.InvalidParameter(err)
	}
	return imageID, nil
}
// imageNotFoundError is returned when an image lookup fails; the NotFound
// marker method lets errdefs-style checks classify it.
type imageNotFoundError string
func (e imageNotFoundError) Error() string {
	return "No such image: " + string(e)
}
// NotFound marks this error as a not-found error.
func (imageNotFoundError) NotFound() {}
// Search resolves term (a full or partial digest) to an image ID, returning
// a not-found error when no stored digest matches.
func (is *store) Search(term string) (ID, error) {
	dgst, err := is.digestSet.Lookup(term)
	if err == nil {
		return ID(dgst), nil
	}
	if errors.Is(err, digestset.ErrDigestNotFound) {
		err = imageNotFoundError(term)
	}
	return "", errors.WithStack(err)
}
// Get loads the image config for id from the backend and deserializes it.
// A failed parent lookup is treated as "no parent", not an error.
func (is *store) Get(id ID) (*Image, error) {
	// todo: Check if image is in images
	// todo: Detect manual insertions and start using them
	config, err := is.fs.Get(id.Digest())
	if err != nil {
		return nil, errdefs.NotFound(err)
	}
	img, err := NewFromJSON(config)
	if err != nil {
		return nil, errdefs.InvalidParameter(err)
	}
	img.computedID = id
	img.Parent, err = is.GetParent(id)
	if err != nil {
		img.Parent = ""
	}
	return img, nil
}
// Delete removes the image from all indices and the backend, unlinks it
// from its parent and children, and releases its retained layer (returning
// the layer metadata of what was released).
func (is *store) Delete(id ID) ([]layer.Metadata, error) {
	is.Lock()
	defer is.Unlock()
	imgMeta := is.images[id]
	if imgMeta == nil {
		return nil, errdefs.NotFound(fmt.Errorf("unrecognized image ID %s", id.String()))
	}
	_, err := is.Get(id)
	if err != nil {
		return nil, errdefs.NotFound(fmt.Errorf("unrecognized image %s, %v", id.String(), err))
	}
	// Best-effort: orphan the children by dropping their "parent" metadata.
	// Errors are intentionally ignored here.
	for cID := range imgMeta.children {
		is.fs.DeleteMetadata(cID.Digest(), "parent")
	}
	if parent, err := is.GetParent(id); err == nil && is.images[parent] != nil {
		delete(is.images[parent].children, id)
	}
	if err := is.digestSet.Remove(id.Digest()); err != nil {
		log.G(context.TODO()).Errorf("error removing %s from digest set: %q", id, err)
	}
	delete(is.images, id)
	// Backend deletion failure is ignored; the in-memory state is already gone.
	is.fs.Delete(id.Digest())
	if imgMeta.layer != nil {
		return is.lss.Release(imgMeta.layer)
	}
	return nil, nil
}
// SetParent records parentID as the parent of id, updating both the
// in-memory children sets and the persisted "parent" metadata. Any previous
// parent link for id is removed first.
func (is *store) SetParent(id, parentID ID) error {
	is.Lock()
	defer is.Unlock()
	parentMeta := is.images[parentID]
	if parentMeta == nil {
		return errdefs.NotFound(fmt.Errorf("unknown parent image ID %s", parentID.String()))
	}
	if parent, err := is.GetParent(id); err == nil && is.images[parent] != nil {
		delete(is.images[parent].children, id)
	}
	parentMeta.children[id] = struct{}{}
	return is.fs.SetMetadata(id.Digest(), "parent", []byte(parentID))
}
// GetParent returns the parent image ID recorded for id in the "parent"
// metadata key; a missing or unreadable entry is reported as not-found.
func (is *store) GetParent(id ID) (ID, error) {
	md, err := is.fs.GetMetadata(id.Digest(), "parent")
	if err != nil {
		return "", errdefs.NotFound(err)
	}
	return ID(md), nil // todo: validate?
}
// SetLastUpdated records the current time as the "lastUpdated" metadata of
// the image with the given ID, in RFC3339Nano format.
func (is *store) SetLastUpdated(id ID) error {
	now := time.Now().Format(time.RFC3339Nano)
	return is.fs.SetMetadata(id.Digest(), "lastUpdated", []byte(now))
}
// GetLastUpdated returns the recorded "lastUpdated" time of the image with
// the given ID. If the metadata is absent or empty, the zero time is
// returned without an error.
func (is *store) GetLastUpdated(id ID) (time.Time, error) {
	raw, err := is.fs.GetMetadata(id.Digest(), "lastUpdated")
	if err != nil || len(raw) == 0 {
		// No lastUpdated time recorded for this image.
		return time.Time{}, nil
	}
	return time.Parse(time.RFC3339Nano, string(raw))
}
// SetBuiltLocally marks the image as built locally, which allows it to be
// used as a builder cache source.
func (is *store) SetBuiltLocally(id ID) error {
	marker := []byte{1}
	return is.fs.SetMetadata(id.Digest(), "builtLocally", marker)
}
// IsBuiltLocally reports whether the image was marked as built locally and
// may therefore be used as a builder cache source. An absent marker is not
// an error.
func (is *store) IsBuiltLocally(id ID) (bool, error) {
	raw, err := is.fs.GetMetadata(id.Digest(), "builtLocally")
	if err != nil || len(raw) == 0 {
		if errors.Is(err, os.ErrNotExist) {
			// The marker was never written: not built locally.
			return false, nil
		}
		return false, err
	}
	return raw[0] == 1, nil
}
// Children returns the IDs of the images that have the given image as
// their parent. Safe for concurrent use.
func (is *store) Children(id ID) []ID {
	is.RLock()
	defer is.RUnlock()
	return is.children(id)
}
// children returns the IDs of the children of image id. Callers must hold
// at least a read lock.
func (is *store) children(id ID) []ID {
	var ids []ID
	meta := is.images[id]
	if meta == nil {
		return ids
	}
	for child := range meta.children {
		ids = append(ids, child)
	}
	return ids
}
// Heads returns the images that are not the parent of any other image in
// the store (the leaves of the image graph), keyed by ID.
func (is *store) Heads() map[ID]*Image {
	return is.imagesMap(false)
}
// Map returns every image in the store, keyed by ID.
func (is *store) Map() map[ID]*Image {
	return is.imagesMap(true)
}
// imagesMap builds an ID-to-image map of the store's contents. When all is
// false, only head images (those without children) are included. Images
// whose config can no longer be loaded are logged and skipped.
func (is *store) imagesMap(all bool) map[ID]*Image {
	is.RLock()
	defer is.RUnlock()

	result := make(map[ID]*Image)
	for id := range is.images {
		if !all && len(is.children(id)) > 0 {
			// Skip non-head images unless the full map was requested.
			continue
		}
		img, err := is.Get(id)
		if err != nil {
			log.G(context.TODO()).Errorf("invalid image access: %q, error: %q", id, err)
			continue
		}
		result[id] = img
	}
	return result
}
// Len returns the number of images currently in the store.
func (is *store) Len() int {
	is.RLock()
	defer is.RUnlock()
	return len(is.images)
}
package tarexport
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"reflect"
"runtime"
"github.com/containerd/containerd/v2/pkg/tracing"
"github.com/containerd/log"
"github.com/distribution/reference"
"github.com/docker/distribution"
"github.com/docker/docker/image"
"github.com/docker/docker/internal/ioutils"
"github.com/docker/docker/layer"
"github.com/docker/docker/pkg/progress"
"github.com/docker/docker/pkg/streamformatter"
"github.com/docker/docker/pkg/stringid"
"github.com/moby/go-archive/chrootarchive"
"github.com/moby/go-archive/compression"
"github.com/moby/moby/api/types/events"
"github.com/moby/sys/sequential"
"github.com/moby/sys/symlink"
"github.com/opencontainers/go-digest"
)
// Load reads a tar archive of images (as produced by `docker save`) from
// inTar and registers the contained layers, configs, and tags with the
// daemon's stores. Progress and status messages are written to outStream
// unless quiet is set.
func (l *tarexporter) Load(ctx context.Context, inTar io.ReadCloser, outStream io.Writer, quiet bool) (outErr error) {
	ctx, span := tracing.StartSpan(ctx, "tarexport.Load")
	defer span.End()
	defer func() {
		span.SetStatus(outErr)
	}()

	var progressOutput progress.Output
	if !quiet {
		progressOutput = streamformatter.NewJSONProgressOutput(outStream, false)
	}
	outStream = streamformatter.NewStdoutWriter(outStream)

	// Extract the whole archive into a scratch directory removed on return.
	tmpDir, err := os.MkdirTemp("", "docker-import-")
	if err != nil {
		return err
	}
	defer os.RemoveAll(tmpDir)

	if err := untar(ctx, inTar, tmpDir); err != nil {
		return err
	}

	// read manifest, if no file then load in legacy mode
	manifestPath, err := safePath(tmpDir, manifestFileName)
	if err != nil {
		return err
	}
	manifestFile, err := os.Open(manifestPath)
	if err != nil {
		if os.IsNotExist(err) {
			return fmt.Errorf("invalid archive: does not contain a %s", manifestFileName)
		}
		return fmt.Errorf("invalid archive: failed to load %s: %w", manifestFileName, err)
	}
	defer manifestFile.Close()

	var manifest []manifestItem
	if err := json.NewDecoder(manifestFile).Decode(&manifest); err != nil {
		return fmt.Errorf("invalid archive: failed to decode %s: %w", manifestFileName, err)
	}

	// a nil manifest usually indicates a bug, so don't just silently fail.
	// if someone really needs to pass an empty manifest, they can pass [].
	if manifest == nil {
		return errors.New("invalid manifest, manifest cannot be null (but can be [])")
	}

	var parentLinks []parentLink
	var imageIDsStr string
	var imageRefCount int

	for _, m := range manifest {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}
		configPath, err := safePath(tmpDir, m.Config)
		if err != nil {
			return err
		}
		config, err := os.ReadFile(configPath)
		if err != nil {
			return err
		}
		img, err := image.NewFromJSON(config)
		if err != nil {
			return err
		}
		if err := image.CheckOS(img.OperatingSystem()); err != nil {
			return fmt.Errorf("cannot load %s image on %s", img.OperatingSystem(), runtime.GOOS)
		}
		// Skip images that don't match the requested platform filter, if any.
		if l.platformMatcher != nil && !l.platformMatcher.Match(img.Platform()) {
			continue
		}
		// Build the rootfs chain one diff ID at a time, registering each
		// layer with the layer store as we go.
		rootFS := *img.RootFS
		rootFS.DiffIDs = nil

		if expected, actual := len(m.Layers), len(img.RootFS.DiffIDs); expected != actual {
			return fmt.Errorf("invalid manifest, layers length mismatch: expected %d, got %d", expected, actual)
		}

		for i, diffID := range img.RootFS.DiffIDs {
			select {
			case <-ctx.Done():
				return ctx.Err()
			default:
			}
			layerPath, err := safePath(tmpDir, m.Layers[i])
			if err != nil {
				return err
			}
			r := rootFS
			r.Append(diffID)
			// Reuse an already-registered layer when possible; otherwise
			// load it from the archive.
			newLayer, err := l.lss.Get(r.ChainID())
			if err != nil {
				newLayer, err = l.loadLayer(ctx, layerPath, rootFS, diffID.String(), m.LayerSources[diffID], progressOutput)
				if err != nil {
					return err
				}
			}
			// Deliberately deferred inside the loop: every layer reference
			// is held until Load returns so layers can't be removed while
			// images are still being created from them.
			defer layer.ReleaseAndLog(l.lss, newLayer)
			if expected, actual := diffID, newLayer.DiffID(); expected != actual {
				return fmt.Errorf("invalid diffID for layer %d: expected %q, got %q", i, expected, actual)
			}
			rootFS.Append(diffID)
		}

		imgID, err := l.is.Create(config)
		if err != nil {
			return err
		}
		imageIDsStr += fmt.Sprintf("Loaded image ID: %s\n", imgID)

		imageRefCount = 0
		for _, repoTag := range m.RepoTags {
			named, err := reference.ParseNormalizedNamed(repoTag)
			if err != nil {
				return err
			}
			ref, ok := named.(reference.NamedTagged)
			if !ok {
				return fmt.Errorf("invalid tag %q", repoTag)
			}
			l.setLoadedTag(ref, imgID.Digest(), outStream)
			fmt.Fprintf(outStream, "Loaded image: %s\n", reference.FamiliarString(ref))
			imageRefCount++
		}

		parentLinks = append(parentLinks, parentLink{imgID, m.Parent})
		l.loggerImgEvent.LogImageEvent(ctx, imgID.String(), imgID.String(), events.ActionLoad)
	}

	// Wire up parent links, but only those whose parent image was part of
	// this load (validatedParentLinks clears the rest).
	for _, p := range validatedParentLinks(parentLinks) {
		if p.parentID != "" {
			if err := l.setParentID(p.id, p.parentID); err != nil {
				return err
			}
		}
	}

	// Only print raw image IDs when no tags were printed above.
	if imageRefCount == 0 {
		outStream.Write([]byte(imageIDsStr))
	}

	return nil
}
// untar extracts inTar into tmpDir inside a chroot, honoring ctx
// cancellation via a context-aware reader.
func untar(ctx context.Context, inTar io.ReadCloser, tmpDir string) error {
	_, span := tracing.StartSpan(ctx, "chrootarchive.Untar")
	defer span.End()

	reader := ioutils.NewCtxReader(ctx, inTar)
	err := chrootarchive.Untar(reader, tmpDir, nil)
	span.SetStatus(err)
	return err
}
// setParentID validates that parentID's history is a plausible direct
// ancestor of id's and, if so, records the parent relationship.
func (l *tarexporter) setParentID(id, parentID image.ID) error {
	child, err := l.is.Get(id)
	if err != nil {
		return err
	}
	parent, err := l.is.Get(parentID)
	if err != nil {
		return err
	}
	if checkValidParent(child, parent) {
		return l.is.SetParent(id, parentID)
	}
	return fmt.Errorf("image %v is not a valid parent for %v", parent.ID(), child.ID())
}
// loadLayer reads a layer tar from filename and registers it with the layer
// store under the chain derived from rootFS (which must hold the diff IDs
// of all ancestor layers). id is used only for progress labeling; a
// non-empty foreignSrc is recorded as the layer's foreign source descriptor
// when the store supports it.
func (l *tarexporter) loadLayer(ctx context.Context, filename string, rootFS image.RootFS, id string, foreignSrc distribution.Descriptor, progressOutput progress.Output) (_ layer.Layer, outErr error) {
	ctx, span := tracing.StartSpan(ctx, "loadLayer")
	span.SetAttributes(tracing.Attribute("image.id", id))
	defer span.End()
	defer func() {
		span.SetStatus(outErr)
	}()

	// We use sequential file access to avoid depleting the standby list on Windows.
	// On Linux, this equates to a regular os.Open.
	rawTar, err := sequential.Open(filename)
	if err != nil {
		log.G(context.TODO()).Debugf("Error reading embedded tar: %v", err)
		return nil, err
	}
	defer rawTar.Close()

	// Wrap the tar in a progress reader when progress reporting is enabled.
	var r io.Reader
	if progressOutput != nil {
		fileInfo, err := rawTar.Stat()
		if err != nil {
			log.G(context.TODO()).Debugf("Error statting file: %v", err)
			return nil, err
		}
		r = progress.NewProgressReader(rawTar, progressOutput, fileInfo.Size(), stringid.TruncateID(id), "Loading layer")
	} else {
		r = rawTar
	}

	// Decompress transparently (the layer may be gzipped); the reader also
	// honors ctx cancellation.
	inflatedLayerData, err := compression.DecompressStream(ioutils.NewCtxReader(ctx, r))
	if err != nil {
		return nil, err
	}
	defer inflatedLayerData.Close()

	// Prefer the descriptor-aware registration path when the store supports
	// it, so foreign-layer source info is preserved.
	if ds, ok := l.lss.(layer.DescribableStore); ok {
		return ds.RegisterWithDescriptor(inflatedLayerData, rootFS.ChainID(), foreignSrc)
	}
	return l.lss.Register(inflatedLayerData, rootFS.ChainID())
}
// setLoadedTag points ref at imgID in the reference store, printing a
// notice to outStream when the tag is being moved off a different image.
func (l *tarexporter) setLoadedTag(ref reference.Named, imgID digest.Digest, outStream io.Writer) error {
	if prevID, err := l.rs.Get(ref); err == nil && prevID != imgID {
		fmt.Fprintf(outStream, "The image %s already exists, renaming the old one with ID %s to empty string\n", reference.FamiliarString(ref), string(prevID)) // todo: this message is wrong in case of multiple tags
	}
	return l.rs.AddTag(ref, imgID, true)
}
// safePath joins base and path, resolving any symlinks so that the result
// is guaranteed to remain inside base.
func safePath(base, path string) (string, error) {
	joined := filepath.Join(base, path)
	return symlink.FollowSymlinkInScope(joined, base)
}
// parentLink associates an image with the ID of its parent image, as
// recorded in the tar manifest.
type parentLink struct {
	id, parentID image.ID
}
// validatedParentLinks returns a copy of pl where any parentID that does
// not refer to another entry in pl (self-references excluded) is cleared,
// so only parents that are part of the same batch survive.
func validatedParentLinks(pl []parentLink) []parentLink {
	var ret []parentLink
	for _, p := range pl {
		parentPresent := false
		for _, candidate := range pl {
			if candidate.id == p.parentID && candidate.id != p.id {
				parentPresent = true
				break
			}
		}
		if !parentPresent {
			p.parentID = ""
		}
		ret = append(ret, p)
	}
	return ret
}
// checkValidParent reports whether parent is a plausible direct parent of
// img: parent's history must be exactly one entry shorter than img's and
// match it entry-for-entry (Created timestamps are compared by value, the
// other fields via DeepEqual).
func checkValidParent(img, parent *image.Image) bool {
	if len(img.History) == 0 && len(parent.History) == 0 {
		return true // having history is not mandatory
	}
	if len(img.History)-len(parent.History) != 1 {
		return false
	}
	for i, hP := range parent.History {
		hC := img.History[i]
		// Both entries must agree on whether a Created time is present.
		if (hP.Created == nil) != (hC.Created == nil) {
			return false
		}
		if hP.Created != nil && !hP.Created.Equal(*hC.Created) {
			return false
		}
		// Normalize the Created pointer on the local copy so DeepEqual
		// compares the remaining fields rather than pointer identity.
		hC.Created = hP.Created
		if !reflect.DeepEqual(hP, hC) {
			return false
		}
	}
	return true
}
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code in this file is a modified version of go stdlib;
// https://cs.opensource.google/go/go/+/refs/tags/go1.23.4:src/os/path.go;l=19-66
package tarexport
import (
"fmt"
"os"
"path/filepath"
"syscall"
"time"
"github.com/docker/docker/pkg/system"
)
// mkdirAllWithChtimes is nearly an identical copy to the [os.MkdirAll] but
// tracks created directories and applies the provided mtime and atime using
// [system.Chtimes]. Timestamps are applied to every directory this call
// creates (including recursively created parents), keeping archive output
// deterministic.
func mkdirAllWithChtimes(path string, perm os.FileMode, atime, mtime time.Time) error {
	// Fast path: if we can tell whether path is a directory or file, stop with success or error.
	dir, err := os.Stat(path)
	if err == nil {
		if dir.IsDir() {
			return nil
		}
		return &os.PathError{Op: "mkdir", Path: path, Err: syscall.ENOTDIR}
	}

	// Slow path: make sure parent exists and then call Mkdir for path.

	// Extract the parent folder from path by first removing any trailing
	// path separator and then scanning backward until finding a path
	// separator or reaching the beginning of the string.
	i := len(path) - 1
	for i >= 0 && os.IsPathSeparator(path[i]) {
		i--
	}
	for i >= 0 && !os.IsPathSeparator(path[i]) {
		i--
	}
	if i < 0 {
		i = 0
	}

	// If there is a parent directory, and it is not the volume name,
	// recurse to ensure parent directory exists.
	if parent := path[:i]; len(parent) > len(filepath.VolumeName(path)) {
		err = mkdirAllWithChtimes(parent, perm, atime, mtime)
		if err != nil {
			return err
		}
	}

	// Parent now exists; invoke Mkdir and use its result.
	err = os.Mkdir(path, perm)
	if err != nil {
		// Handle arguments like "foo/." by
		// double-checking that directory doesn't exist.
		dir, err1 := os.Lstat(path)
		if err1 == nil && dir.IsDir() {
			return nil
		}
		return err
	}

	// Directory was freshly created: pin its access and modification times.
	if err := system.Chtimes(path, atime, mtime); err != nil {
		return fmt.Errorf("applying atime=%v and mtime=%v: %w", atime, mtime, err)
	}
	return nil
}
package tarexport
import (
"context"
"encoding/json"
"fmt"
"io"
"os"
"path"
"path/filepath"
"time"
c8dimages "github.com/containerd/containerd/v2/core/images"
"github.com/containerd/containerd/v2/pkg/tracing"
"github.com/containerd/log"
"github.com/containerd/platforms"
"github.com/distribution/reference"
"github.com/docker/distribution"
"github.com/docker/docker/image"
v1 "github.com/docker/docker/image/v1"
"github.com/docker/docker/internal/ioutils"
"github.com/docker/docker/layer"
"github.com/docker/docker/pkg/system"
"github.com/moby/go-archive"
"github.com/moby/moby/api/types/events"
"github.com/moby/sys/sequential"
"github.com/opencontainers/go-digest"
"github.com/opencontainers/image-spec/specs-go"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
"github.com/pkg/errors"
)
// imageDescriptor holds everything needed to export one image: its tagged
// references, the ordered layer diff IDs, the parsed image itself, and a
// held reference to its top layer (released when the save finishes).
type imageDescriptor struct {
	refs     []reference.NamedTagged
	layers   []layer.DiffID
	image    *image.Image
	layerRef layer.Layer
}
// saveSession carries the state of a single Save operation: the set of
// images to export plus caches of blobs already written to outDir, so that
// layers and configs shared between images are serialized only once.
type saveSession struct {
	*tarexporter
	outDir       string
	images       map[image.ID]*imageDescriptor
	savedLayers  map[layer.DiffID]distribution.Descriptor
	savedConfigs map[string]struct{}
}
// Save exports the named images to outStream as a tar archive compatible
// with `docker load`.
func (l *tarexporter) Save(ctx context.Context, names []string, outStream io.Writer) error {
	descriptors, err := l.parseNames(ctx, names)
	if err != nil {
		return err
	}
	// Release all the image top layer references
	defer l.releaseLayerReferences(descriptors)

	session := &saveSession{tarexporter: l, images: descriptors}
	return session.save(ctx, outStream)
}
// parseNames will parse the image names to a map which contains image.ID to *imageDescriptor.
// Each imageDescriptor holds an image top layer reference named 'layerRef'. It is taken here, should be released later.
// Names may be digest IDs, name-only references (expanding to all tags), or
// fully tagged references; on any error all already-taken layer references
// are released before returning.
func (l *tarexporter) parseNames(ctx context.Context, names []string) (desc map[image.ID]*imageDescriptor, rErr error) {
	imgDescr := make(map[image.ID]*imageDescriptor)
	// On failure, release every layer reference taken so far.
	defer func() {
		if rErr != nil {
			l.releaseLayerReferences(imgDescr)
		}
	}()

	// addAssoc registers the image (taking its top-layer reference on first
	// sight) and appends ref to the image's tag list, deduplicated.
	addAssoc := func(id image.ID, ref reference.Named) error {
		if _, ok := imgDescr[id]; !ok {
			descr := &imageDescriptor{}
			if err := l.takeLayerReference(id, descr); err != nil {
				return err
			}
			imgDescr[id] = descr
		}

		if ref != nil {
			// Canonical (digest-pinned) references carry no tag to record.
			if _, ok := ref.(reference.Canonical); ok {
				return nil
			}
			tagged, ok := reference.TagNameOnly(ref).(reference.NamedTagged)
			if !ok {
				return nil
			}

			// Skip duplicates of tags already recorded for this image.
			for _, t := range imgDescr[id].refs {
				if tagged.String() == t.String() {
					return nil
				}
			}
			imgDescr[id].refs = append(imgDescr[id].refs, tagged)
		}
		return nil
	}

	for _, name := range names {
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		default:
		}
		ref, err := reference.ParseAnyReference(name)
		if err != nil {
			return nil, err
		}
		namedRef, ok := ref.(reference.Named)
		if !ok {
			// Check if digest ID reference
			if digested, ok := ref.(reference.Digested); ok {
				if err := addAssoc(image.ID(digested.Digest()), nil); err != nil {
					return nil, err
				}
				continue
			}
			return nil, errors.Errorf("invalid reference: %v", name)
		}

		// A name equal to the canonical digest algorithm (e.g. "sha256")
		// indicates a truncated image ID; resolve it via the image store.
		if reference.FamiliarName(namedRef) == string(digest.Canonical) {
			imgID, err := l.is.Search(name)
			if err != nil {
				return nil, err
			}
			if err := addAssoc(imgID, nil); err != nil {
				return nil, err
			}
			continue
		}
		if reference.IsNameOnly(namedRef) {
			// Expand a bare repository name to all of its tagged references.
			assocs := l.rs.ReferencesByName(namedRef)
			for _, assoc := range assocs {
				if err := addAssoc(image.ID(assoc.ID), assoc.Ref); err != nil {
					return nil, err
				}
			}
			// No tags found: fall back to treating the name as an image ID.
			if len(assocs) == 0 {
				imgID, err := l.is.Search(name)
				if err != nil {
					return nil, err
				}
				if err := addAssoc(imgID, nil); err != nil {
					return nil, err
				}
			}
			continue
		}
		id, err := l.rs.Get(namedRef)
		if err != nil {
			return nil, err
		}
		if err := addAssoc(image.ID(id), namedRef); err != nil {
			return nil, err
		}
	}
	return imgDescr, nil
}
// takeLayerReference loads the image with the given id, verifies that its
// OS and platform are exportable, and takes a reference to its top layer
// (stored in imgDescr.layerRef) which the caller must release later.
func (l *tarexporter) takeLayerReference(id image.ID, imgDescr *imageDescriptor) error {
	img, err := l.is.Get(id)
	if err != nil {
		return err
	}
	if err := image.CheckOS(img.OperatingSystem()); err != nil {
		return fmt.Errorf("os %q is not supported", img.OperatingSystem())
	}
	if l.platform != nil && !l.platformMatcher.Match(img.Platform()) {
		return errors.New("no suitable export target found for platform " + platforms.FormatAll(*l.platform))
	}
	imgDescr.image = img

	chainID := img.RootFS.ChainID()
	if chainID == "" {
		// The image has no layers, so there is nothing to reference.
		return nil
	}
	topLayer, err := l.lss.Get(chainID)
	if err != nil {
		return err
	}
	imgDescr.layerRef = topLayer
	return nil
}
// releaseLayerReferences releases every top-layer reference held by the
// descriptors in imgDescr. It never fails.
func (l *tarexporter) releaseLayerReferences(imgDescr map[image.ID]*imageDescriptor) error {
	for _, d := range imgDescr {
		if ref := d.layerRef; ref != nil {
			l.lss.Release(ref)
		}
	}
	return nil
}
// save writes every image in s.images to outStream as a tar archive laid
// out per the OCI image-layout spec (blobs directory, oci-layout, and
// index.json), together with a legacy manifest.json — and a legacy
// "repositories" file when any tags are present — for backward
// compatibility. All file timestamps are pinned to the Unix epoch so the
// output is reproducible.
func (s *saveSession) save(ctx context.Context, outStream io.Writer) error {
	s.savedConfigs = make(map[string]struct{})
	s.savedLayers = make(map[layer.DiffID]distribution.Descriptor)

	// get image json
	tempDir, err := os.MkdirTemp("", "docker-export-")
	if err != nil {
		return err
	}
	defer os.RemoveAll(tempDir)

	s.outDir = tempDir
	reposLegacy := make(map[string]map[string]string)

	var manifest []manifestItem
	var parentLinks []parentLink
	var manifestDescriptors []ocispec.Descriptor

	for id, imageDescr := range s.images {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}
		// Write the image config and all of its layer blobs.
		foreignSrcs, err := s.saveImage(ctx, id)
		if err != nil {
			return err
		}

		var (
			repoTags []string
			layers   []string
			foreign  = make([]ocispec.Descriptor, 0, len(foreignSrcs))
		)

		// Layers in manifest must follow the actual layer order from config.
		for _, l := range imageDescr.layers {
			desc := foreignSrcs[l]
			foreign = append(foreign, ocispec.Descriptor{
				MediaType:   desc.MediaType,
				Digest:      desc.Digest,
				Size:        desc.Size,
				URLs:        desc.URLs,
				Annotations: desc.Annotations,
				Platform:    desc.Platform,
			})
		}

		data, err := json.Marshal(ocispec.Manifest{
			Versioned: specs.Versioned{
				SchemaVersion: 2,
			},
			MediaType: ocispec.MediaTypeImageManifest,
			Config: ocispec.Descriptor{
				MediaType: ocispec.MediaTypeImageConfig,
				Digest:    digest.Digest(imageDescr.image.ID()),
				Size:      int64(len(imageDescr.image.RawJSON())),
			},
			Layers: foreign,
		})
		if err != nil {
			return errors.Wrap(err, "error marshaling manifest")
		}
		dgst := digest.FromBytes(data)

		// Write the OCI manifest blob, pinning directory and file times.
		mFile := filepath.Join(s.outDir, ocispec.ImageBlobsDir, dgst.Algorithm().String(), dgst.Encoded())
		if err := mkdirAllWithChtimes(filepath.Dir(mFile), 0o755, time.Unix(0, 0), time.Unix(0, 0)); err != nil {
			return errors.Wrap(err, "error creating blob directory")
		}
		if err := system.Chtimes(filepath.Dir(mFile), time.Unix(0, 0), time.Unix(0, 0)); err != nil {
			return errors.Wrap(err, "error setting blob directory timestamps")
		}
		if err := os.WriteFile(mFile, data, 0o644); err != nil {
			return errors.Wrap(err, "error writing oci manifest file")
		}
		if err := system.Chtimes(mFile, time.Unix(0, 0), time.Unix(0, 0)); err != nil {
			return errors.Wrap(err, "error setting oci manifest file timestamps")
		}

		untaggedMfstDesc := ocispec.Descriptor{
			MediaType: ocispec.MediaTypeImageManifest,
			Digest:    dgst,
			Size:      int64(len(data)),
		}
		for _, ref := range imageDescr.refs {
			familiarName := reference.FamiliarName(ref)
			if _, ok := reposLegacy[familiarName]; !ok {
				reposLegacy[familiarName] = make(map[string]string)
			}
			reposLegacy[familiarName][ref.Tag()] = imageDescr.layers[len(imageDescr.layers)-1].Encoded()
			repoTags = append(repoTags, reference.FamiliarString(ref))

			taggedManifest := untaggedMfstDesc
			taggedManifest.Annotations = map[string]string{
				c8dimages.AnnotationImageName: ref.String(),
				ocispec.AnnotationRefName:     ref.Tag(),
			}
			manifestDescriptors = append(manifestDescriptors, taggedManifest)
		}

		// If no ref was assigned to THIS image, add its untagged descriptor
		// so the image is still included in index.json. Checking the
		// per-image refs (rather than the accumulated manifestDescriptors
		// slice) ensures an untagged image is not dropped just because an
		// earlier image in the loop was tagged.
		if len(imageDescr.refs) == 0 {
			manifestDescriptors = append(manifestDescriptors, untaggedMfstDesc)
		}

		for _, lDgst := range imageDescr.layers {
			// IMPORTANT: We use path, not filepath here to ensure the layers
			// in the manifest use Unix-style forward-slashes.
			layers = append(layers, path.Join(ocispec.ImageBlobsDir, lDgst.Algorithm().String(), lDgst.Encoded()))
		}

		manifest = append(manifest, manifestItem{
			Config:       path.Join(ocispec.ImageBlobsDir, id.Digest().Algorithm().String(), id.Digest().Encoded()),
			RepoTags:     repoTags,
			Layers:       layers,
			LayerSources: foreignSrcs,
		})

		parentID, _ := s.is.GetParent(id)
		parentLinks = append(parentLinks, parentLink{id, parentID})
		s.tarexporter.loggerImgEvent.LogImageEvent(ctx, id.String(), id.String(), events.ActionSave)
	}

	// Record parents in the legacy manifest, but only for parents that are
	// themselves part of this export.
	for i, p := range validatedParentLinks(parentLinks) {
		if p.parentID != "" {
			manifest[i].Parent = p.parentID
		}
	}

	// Legacy "repositories" file, written only when there are tags.
	if len(reposLegacy) > 0 {
		reposFile := filepath.Join(tempDir, legacyRepositoriesFileName)
		rf, err := os.OpenFile(reposFile, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0o644)
		if err != nil {
			return err
		}
		if err := json.NewEncoder(rf).Encode(reposLegacy); err != nil {
			rf.Close()
			return err
		}
		rf.Close()
		if err := system.Chtimes(reposFile, time.Unix(0, 0), time.Unix(0, 0)); err != nil {
			return err
		}
	}

	manifestPath := filepath.Join(tempDir, manifestFileName)
	f, err := os.OpenFile(manifestPath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0o644)
	if err != nil {
		return err
	}
	if err := json.NewEncoder(f).Encode(manifest); err != nil {
		f.Close()
		return err
	}
	f.Close()
	if err := system.Chtimes(manifestPath, time.Unix(0, 0), time.Unix(0, 0)); err != nil {
		return err
	}

	const ociLayoutContent = `{"imageLayoutVersion": "` + ocispec.ImageLayoutVersion + `"}`
	layoutPath := filepath.Join(tempDir, ocispec.ImageLayoutFile)
	if err := os.WriteFile(layoutPath, []byte(ociLayoutContent), 0o644); err != nil {
		return errors.Wrap(err, "error writing oci layout file")
	}
	if err := system.Chtimes(layoutPath, time.Unix(0, 0), time.Unix(0, 0)); err != nil {
		return errors.Wrap(err, "error setting oci layout file timestamps")
	}

	data, err := json.Marshal(ocispec.Index{
		Versioned: specs.Versioned{
			SchemaVersion: 2,
		},
		MediaType: ocispec.MediaTypeImageIndex,
		Manifests: manifestDescriptors,
	})
	if err != nil {
		return errors.Wrap(err, "error marshaling oci index")
	}

	idxFile := filepath.Join(s.outDir, ocispec.ImageIndexFile)
	if err := os.WriteFile(idxFile, data, 0o644); err != nil {
		return errors.Wrap(err, "error writing oci index file")
	}
	if err := system.Chtimes(idxFile, time.Unix(0, 0), time.Unix(0, 0)); err != nil {
		return errors.Wrap(err, "error setting oci index file timestamps")
	}

	return s.writeTar(ctx, tempDir, outStream)
}
// writeTar packs tempDir into an uncompressed tar stream and copies it to
// outStream, honoring ctx cancellation.
func (s *saveSession) writeTar(ctx context.Context, tempDir string, outStream io.Writer) error {
	ctx, span := tracing.StartSpan(ctx, "writeTar")
	defer span.End()

	tarStream, err := archive.Tar(tempDir, archive.Uncompressed)
	if err != nil {
		span.SetStatus(err)
		return err
	}
	defer tarStream.Close()

	_, copyErr := ioutils.CopyCtx(ctx, outStream, tarStream)
	span.SetStatus(copyErr)
	return copyErr
}
// saveImage writes the config blob and all layer blobs of image id into
// the session's output directory, synthesizing a legacy (pre-v1.9) V1
// config per layer. It returns descriptors of any foreign (URL-backed)
// layers keyed by diff ID, and records the image's layer order in
// s.images[id].layers.
func (s *saveSession) saveImage(ctx context.Context, id image.ID) (_ map[layer.DiffID]distribution.Descriptor, outErr error) {
	ctx, span := tracing.StartSpan(ctx, "saveImage")
	span.SetAttributes(tracing.Attribute("image.id", id.String()))
	defer span.End()
	defer func() {
		span.SetStatus(outErr)
	}()

	img := s.images[id].image
	if len(img.RootFS.DiffIDs) == 0 {
		return nil, errors.New("empty export - not implemented")
	}

	// Use the image's creation time for blob timestamps, falling back to
	// the Unix epoch for reproducibility.
	ts := time.Unix(0, 0)
	if img.Created != nil {
		ts = *img.Created
	}

	var parent digest.Digest
	var layers []layer.DiffID
	var foreignSrcs map[layer.DiffID]distribution.Descriptor
	for i, diffID := range img.RootFS.DiffIDs {
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		default:
		}
		// Synthesize a minimal V1 config for intermediate layers; the
		// topmost layer reuses the image's real V1 config.
		v1ImgCreated := time.Unix(0, 0)
		v1Img := image.V1Image{
			// This is for backward compatibility used for
			// pre v1.9 docker.
			Created: &v1ImgCreated,
		}
		if i == len(img.RootFS.DiffIDs)-1 {
			v1Img = img.V1Image
		}
		// The chain ID for this layer is derived from the diff IDs up to
		// and including position i.
		rootFS := *img.RootFS
		rootFS.DiffIDs = rootFS.DiffIDs[:i+1]
		v1ID, err := v1.CreateID(v1Img, rootFS.ChainID(), parent)
		if err != nil {
			return nil, err
		}
		v1Img.ID = v1ID.Encoded()
		if parent != "" {
			v1Img.Parent = parent.Encoded()
		}
		v1Img.OS = img.OS

		src, err := s.saveConfigAndLayer(ctx, rootFS.ChainID(), v1Img, &ts)
		if err != nil {
			return nil, err
		}

		layers = append(layers, diffID)
		parent = v1ID
		// Only foreign layers carry a source digest; record them so the
		// manifest can point back at their original source.
		if src.Digest != "" {
			if foreignSrcs == nil {
				foreignSrcs = make(map[layer.DiffID]distribution.Descriptor)
			}
			foreignSrcs[img.RootFS.DiffIDs[i]] = src
		}
	}

	// Write the image config blob and pin the timestamps of the blob file
	// and its containing directories.
	data := img.RawJSON()
	dgst := digest.FromBytes(data)
	blobDir := filepath.Join(s.outDir, ocispec.ImageBlobsDir, dgst.Algorithm().String())
	if err := mkdirAllWithChtimes(blobDir, 0o755, ts, ts); err != nil {
		return nil, err
	}
	if err := system.Chtimes(blobDir, ts, ts); err != nil {
		return nil, err
	}
	if err := system.Chtimes(filepath.Dir(blobDir), ts, ts); err != nil {
		return nil, err
	}

	configFile := filepath.Join(blobDir, dgst.Encoded())
	if err := os.WriteFile(configFile, img.RawJSON(), 0o644); err != nil {
		return nil, err
	}
	if err := system.Chtimes(configFile, ts, ts); err != nil {
		return nil, err
	}

	s.images[id].layers = layers
	return foreignSrcs, nil
}
// saveConfigAndLayer writes the legacy V1 config for legacyImg (if not
// already saved this session) and the tar archive of layer chain id into
// the session's blob directory, returning the layer's descriptor (with
// foreign-source information when the layer is URL-backed). Timestamps are
// pinned to createdTime, or the Unix epoch when nil.
func (s *saveSession) saveConfigAndLayer(ctx context.Context, id layer.ChainID, legacyImg image.V1Image, createdTime *time.Time) (_ distribution.Descriptor, outErr error) {
	ctx, span := tracing.StartSpan(ctx, "saveConfigAndLayer")
	span.SetAttributes(
		tracing.Attribute("layer.id", id.String()),
		tracing.Attribute("image.id", legacyImg.ID),
	)
	defer span.End()
	defer func() {
		span.SetStatus(outErr)
	}()

	ts := time.Unix(0, 0)
	if createdTime != nil {
		ts = *createdTime
	}

	outDir := filepath.Join(s.outDir, ocispec.ImageBlobsDir)
	if _, ok := s.savedConfigs[legacyImg.ID]; !ok {
		if err := s.saveConfig(legacyImg, outDir, createdTime); err != nil {
			return distribution.Descriptor{}, err
		}
	}

	// serialize filesystem
	l, err := s.lss.Get(id)
	if err != nil {
		return distribution.Descriptor{}, err
	}

	lDiffID := l.DiffID()
	lDgst := lDiffID
	// A layer already written this session is returned from the cache.
	if _, ok := s.savedLayers[lDiffID]; ok {
		return s.savedLayers[lDiffID], nil
	}
	layerPath := filepath.Join(outDir, lDiffID.Algorithm().String(), lDiffID.Encoded())
	defer layer.ReleaseAndLog(s.lss, l)

	if _, err = os.Stat(layerPath); err == nil {
		// This should not happen: if the layer path was already created, we
		// should have returned early above. Log a warning and proceed to
		// recreate the archive.
		log.G(context.TODO()).WithFields(log.Fields{
			"layerPath": layerPath,
			"id":        id,
			"lDgst":     lDgst,
		}).Warn("LayerPath already exists but the descriptor is not cached")
	} else if !os.IsNotExist(err) {
		return distribution.Descriptor{}, err
	}

	// We use sequential file access to avoid depleting the standby list on
	// Windows. On Linux, this equates to a regular os.Create.
	if err := mkdirAllWithChtimes(filepath.Dir(layerPath), 0o755, ts, ts); err != nil {
		return distribution.Descriptor{}, errors.Wrap(err, "could not create layer dir parent")
	}
	tarFile, err := sequential.Create(layerPath)
	if err != nil {
		return distribution.Descriptor{}, errors.Wrap(err, "error creating layer file")
	}
	defer tarFile.Close()

	arch, err := l.TarStream()
	if err != nil {
		return distribution.Descriptor{}, err
	}
	defer arch.Close()

	// Digest the tar stream while writing it out, so a mismatch with the
	// recorded diff ID can be detected below.
	digester := digest.Canonical.Digester()
	digestedArch := io.TeeReader(arch, digester.Hash())

	tarSize, err := ioutils.CopyCtx(ctx, tarFile, digestedArch)
	if err != nil {
		return distribution.Descriptor{}, err
	}

	tarDigest := digester.Digest()
	if lDgst != tarDigest {
		// The serialized archive does not hash to the recorded diff ID;
		// re-home the blob under the actual digest.
		log.G(context.TODO()).WithFields(log.Fields{
			"layerDigest":  lDgst,
			"actualDigest": tarDigest,
		}).Warn("layer digest doesn't match its tar archive digest")

		lDgst = digester.Digest()
		layerPath = filepath.Join(outDir, lDgst.Algorithm().String(), lDgst.Encoded())
	}

	for _, fname := range []string{outDir, layerPath} {
		// todo: maybe save layer created timestamp?
		if err := system.Chtimes(fname, ts, ts); err != nil {
			return distribution.Descriptor{}, errors.Wrap(err, "could not set layer timestamp")
		}
	}

	// Foreign layers expose their source descriptor via Describable.
	var desc distribution.Descriptor
	if fs, ok := l.(distribution.Describable); ok {
		desc = fs.Descriptor()
	}

	if desc.Digest == "" {
		desc.Digest = tarDigest
		desc.Size = tarSize
	}
	if desc.MediaType == "" {
		desc.MediaType = ocispec.MediaTypeImageLayer
	}

	s.savedLayers[lDiffID] = desc
	return desc, nil
}
// saveConfig writes the legacy (pre-OCI) image configuration for legacyImg
// into the blob directory under outDir and records it in s.savedConfigs so
// it is written at most once per session. Timestamps are pinned to
// createdTime, or the Unix epoch when nil, for reproducible output.
func (s *saveSession) saveConfig(legacyImg image.V1Image, outDir string, createdTime *time.Time) error {
	imageConfig, err := json.Marshal(legacyImg)
	if err != nil {
		return err
	}

	ts := time.Unix(0, 0)
	if createdTime != nil {
		ts = *createdTime
	}

	cfgDgst := digest.FromBytes(imageConfig)
	configPath := filepath.Join(outDir, cfgDgst.Algorithm().String(), cfgDgst.Encoded())
	// This path holds the config blob, so the error message names the
	// config directory (not "layer dir" as in saveConfigAndLayer).
	if err := mkdirAllWithChtimes(filepath.Dir(configPath), 0o755, ts, ts); err != nil {
		return errors.Wrap(err, "could not create config dir parent")
	}
	if err := os.WriteFile(configPath, imageConfig, 0o644); err != nil {
		return err
	}
	if err := system.Chtimes(configPath, ts, ts); err != nil {
		return errors.Wrap(err, "could not set config timestamp")
	}
	s.savedConfigs[legacyImg.ID] = struct{}{}
	return nil
}
package tarexport
import (
"context"
"github.com/containerd/platforms"
"github.com/docker/distribution"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
refstore "github.com/docker/docker/reference"
"github.com/moby/moby/api/types/events"
ocispec "github.com/opencontainers/image-spec/specs-go/v1"
)
const (
	// manifestFileName is the name of the legacy manifest file in a saved
	// image tar.
	manifestFileName = "manifest.json"
	// legacyRepositoriesFileName is the name of the legacy tag-mapping file
	// in a saved image tar.
	legacyRepositoriesFileName = "repositories"
)
// manifestItem is one entry of the legacy manifest.json file in a saved
// image tar: the image config path, its tags, the ordered layer paths, and
// optional parent/foreign-layer metadata.
type manifestItem struct {
	Config       string
	RepoTags     []string
	Layers       []string
	Parent       image.ID                                 `json:",omitempty"`
	LayerSources map[layer.DiffID]distribution.Descriptor `json:",omitempty"`
}
// tarexporter implements image.Exporter for the legacy `docker save` /
// `docker load` tar format, backed by the daemon's image, layer, and
// reference stores.
type tarexporter struct {
	is             image.Store
	lss            layer.Store
	rs             refstore.Store
	loggerImgEvent LogImageEvent
	// platform optionally restricts load/save to a single platform;
	// platformMatcher is derived from it and is nil when platform is nil.
	platform        *platforms.Platform
	platformMatcher platforms.Matcher
}
// LogImageEvent defines interface for event generation related to image tar(load and save) operations
type LogImageEvent interface {
	// LogImageEvent generates an event related to an image operation with
	// the given image ID, reference name, and action (e.g. load, save).
	LogImageEvent(ctx context.Context, imageID, refName string, action events.Action)
}
// NewTarExporter returns new Exporter for tar packages. When platform is
// non-nil, load and save operations are restricted to images strictly
// matching that platform.
func NewTarExporter(is image.Store, lss layer.Store, rs refstore.Store, loggerImgEvent LogImageEvent, platform *ocispec.Platform) image.Exporter {
	exporter := &tarexporter{
		is:             is,
		lss:            lss,
		rs:             rs,
		loggerImgEvent: loggerImgEvent,
		platform:       platform,
	}
	if platform == nil {
		return exporter
	}
	exporter.platformMatcher = platforms.OnlyStrict(*platform)
	return exporter
}
package v1
import (
"context"
"encoding/json"
"github.com/containerd/log"
"github.com/docker/docker/image"
"github.com/docker/docker/layer"
"github.com/opencontainers/go-digest"
)
// CreateID creates an ID from v1 image, layerID and parent ID.
// Used for backwards compatibility with old clients.
func CreateID(v1Image image.V1Image, layerID layer.ChainID, parent digest.Digest) (digest.Digest, error) {
	// The ID field must not contribute to its own digest.
	v1Image.ID = ""
	encoded, err := json.Marshal(v1Image)
	if err != nil {
		return "", err
	}

	// Round-trip through a generic map so extra fields can be injected.
	var fields map[string]*json.RawMessage
	if err := json.Unmarshal(encoded, &fields); err != nil {
		return "", err
	}

	// FIXME: note that this is slightly incompatible with RootFS logic
	fields["layer_id"] = rawJSON(layerID)
	if parent != "" {
		fields["parent"] = rawJSON(parent)
	}

	canonical, err := json.Marshal(fields)
	if err != nil {
		return "", err
	}
	log.G(context.TODO()).Debugf("CreateV1ID %s", canonical)
	return digest.FromBytes(canonical), nil
}
// rawJSON marshals value and returns the encoding as a *json.RawMessage,
// or nil when marshaling fails.
func rawJSON(value interface{}) *json.RawMessage {
	encoded, err := json.Marshal(value)
	if err != nil {
		return nil
	}
	raw := json.RawMessage(encoded)
	return &raw
}
package cleanups
import (
"context"
"github.com/docker/docker/internal/multierror"
)
// Composite is an ordered collection of cleanup functions that are invoked
// in reverse order of registration (LIFO), similar to stacked defers.
type Composite struct {
	cleanups []func(context.Context) error
}
// Add adds a cleanup to be called. Cleanups run in reverse order of
// addition when Call (or the function returned by Release) is invoked.
func (c *Composite) Add(f func(context.Context) error) {
	c.cleanups = append(c.cleanups, f)
}
// Call calls all cleanups in reverse order and returns an error combining
// all non-nil errors. The list is emptied afterwards, so a subsequent Call
// is a no-op.
func (c *Composite) Call(ctx context.Context) error {
	result := call(ctx, c.cleanups)
	c.cleanups = nil
	return result
}
// Release removes all cleanups, turning Call into a no-op.
// Caller still can call the cleanups by calling the returned function
// which is equivalent to calling the Call before Release was called.
func (c *Composite) Release() func(context.Context) error {
	detached := c.cleanups
	c.cleanups = nil
	return func(ctx context.Context) error {
		return call(ctx, detached)
	}
}
// call invokes the cleanups in reverse order, collecting every result
// (including nils) and joining the non-nil errors into one.
func call(ctx context.Context, cleanups []func(context.Context) error) error {
	var errs []error
	for i := len(cleanups); i > 0; i-- {
		errs = append(errs, cleanups[i-1](ctx))
	}
	return multierror.Join(errs...)
}
//go:build !darwin && !windows
package containerfs
import (
"os"
"syscall"
"time"
"github.com/moby/sys/mount"
"github.com/pkg/errors"
)
// EnsureRemoveAll wraps [os.RemoveAll] to check for specific errors that can
// often be remedied.
// Only use [EnsureRemoveAll] if you really want to make every effort to remove
// a directory.
//
// Because of the way [os.Remove] (and by extension [os.RemoveAll]) works, there
// can be a race between reading directory entries and then actually attempting
// to remove everything in the directory.
// These types of errors do not need to be returned since it's ok for the dir to
// be gone we can just retry the remove operation.
//
// This should not return a [os.ErrNotExist] kind of error under any circumstances.
func EnsureRemoveAll(dir string) error {
	// Paths for which a "not exist" error has already been seen once;
	// seeing the same path twice means retrying will not help.
	notExistErr := make(map[string]bool)
	// track retries
	exitOnErr := make(map[string]int)
	maxRetry := 50
	// Attempt to unmount anything beneath this dir first.
	// Best-effort: the error is deliberately ignored, since the removal
	// loop below handles remaining busy mounts per-path.
	mount.RecursiveUnmount(dir)
	for {
		err := os.RemoveAll(dir)
		if err == nil {
			return nil
		}
		// Everything we know how to remedy surfaces as an *os.PathError;
		// anything else is returned as-is.
		pe, ok := err.(*os.PathError)
		if !ok {
			return err
		}
		if os.IsNotExist(err) {
			if notExistErr[pe.Path] {
				return err
			}
			notExistErr[pe.Path] = true
			// There is a race where some subdir can be removed but after the parent
			// dir entries have been read.
			// So the path could be from `os.Remove(subdir)`
			// If the reported non-existent path is not the passed in `dir` we
			// should just retry, but otherwise return with no error.
			if pe.Path == dir {
				return nil
			}
			continue
		}
		if !errors.Is(pe.Err, syscall.EBUSY) {
			return err
		}
		// EBUSY usually means something is still mounted at pe.Path:
		// unmount it and retry the removal, bounded by maxRetry per path.
		if e := mount.Unmount(pe.Path); e != nil {
			return errors.Wrapf(e, "error while removing %s", dir)
		}
		if exitOnErr[pe.Path] == maxRetry {
			return err
		}
		exitOnErr[pe.Path]++
		time.Sleep(100 * time.Millisecond)
	}
}
package ioutils
import (
"context"
"io"
)
// CopyCtx copies from src to dst until either EOF is reached on src or a context is cancelled.
// The writer is not closed when the context is cancelled.
//
// After CopyCtx exits due to context cancellation, the goroutine that performed
// the copy may still be running if either the reader or writer blocks.
func CopyCtx(ctx context.Context, dst io.Writer, src io.Reader) (n int64, err error) {
copyDone := make(chan struct{})
src = &readerCtx{ctx: ctx, r: src}
go func() {
n, err = io.Copy(dst, src)
close(copyDone)
}()
select {
case <-ctx.Done():
return -1, ctx.Err()
case <-copyDone:
}
return n, err
}
type readerCtx struct {
ctx context.Context
r io.Reader
}
// NewCtxReader wraps the given reader with a reader that doesn't proceed with
// reading if the context is done.
//
// Note: Read will still block if the underlying reader blocks.
func NewCtxReader(ctx context.Context, r io.Reader) io.Reader {
return &readerCtx{ctx: ctx, r: r}
}
func (r *readerCtx) Read(p []byte) (n int, err error) {
if err := r.ctx.Err(); err != nil {
return 0, err
}
n, outErr := r.r.Read(p)
if err := r.ctx.Err(); err != nil {
return 0, err
}
return n, outErr
}
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code below was largely copied from golang.org/x/mod@v0.22;
// https://github.com/golang/mod/blob/v0.22.0/internal/lazyregexp/lazyre.go
// with some additional methods added.
// Package lazyregexp is a thin wrapper over regexp, allowing the use of global
// regexp variables without forcing them to be compiled at init.
package lazyregexp
import (
"os"
"regexp"
"strings"
"sync"
)
// Regexp is a wrapper around [regexp.Regexp], where the underlying regexp will be
// compiled the first time it is needed.
type Regexp struct {
	str  string         // pattern source; cleared once compiled
	once sync.Once      // guards the one-time compilation
	rx   *regexp.Regexp // compiled form; only valid after once has fired
}

// re returns the compiled regexp, compiling it on first use.
func (r *Regexp) re() *regexp.Regexp {
	r.once.Do(r.build)
	return r.rx
}

// build compiles the pattern and releases the pattern source. It panics
// (via regexp.MustCompile) if the pattern is invalid.
func (r *Regexp) build() {
	r.rx = regexp.MustCompile(r.str)
	r.str = ""
}

// FindSubmatch is a lazy wrapper for [regexp.Regexp.FindSubmatch].
func (r *Regexp) FindSubmatch(s []byte) [][]byte {
	return r.re().FindSubmatch(s)
}

// FindAllStringSubmatch is a lazy wrapper for [regexp.Regexp.FindAllStringSubmatch].
func (r *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
	return r.re().FindAllStringSubmatch(s, n)
}

// FindStringSubmatch is a lazy wrapper for [regexp.Regexp.FindStringSubmatch].
func (r *Regexp) FindStringSubmatch(s string) []string {
	return r.re().FindStringSubmatch(s)
}

// FindStringSubmatchIndex is a lazy wrapper for [regexp.Regexp.FindStringSubmatchIndex].
func (r *Regexp) FindStringSubmatchIndex(s string) []int {
	return r.re().FindStringSubmatchIndex(s)
}

// ReplaceAllString is a lazy wrapper for [regexp.Regexp.ReplaceAllString].
func (r *Regexp) ReplaceAllString(src, repl string) string {
	return r.re().ReplaceAllString(src, repl)
}

// FindString is a lazy wrapper for [regexp.Regexp.FindString].
func (r *Regexp) FindString(s string) string {
	return r.re().FindString(s)
}

// FindAllString is a lazy wrapper for [regexp.Regexp.FindAllString].
func (r *Regexp) FindAllString(s string, n int) []string {
	return r.re().FindAllString(s, n)
}

// MatchString is a lazy wrapper for [regexp.Regexp.MatchString].
func (r *Regexp) MatchString(s string) bool {
	return r.re().MatchString(s)
}

// ReplaceAllStringFunc is a lazy wrapper for [regexp.Regexp.ReplaceAllStringFunc].
func (r *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
	return r.re().ReplaceAllStringFunc(src, repl)
}

// SubexpNames is a lazy wrapper for [regexp.Regexp.SubexpNames].
func (r *Regexp) SubexpNames() []string {
	return r.re().SubexpNames()
}

// inTest reports whether the process is running as a test binary (argv[0]
// ends in ".test", ignoring a Windows ".exe" suffix).
var inTest = len(os.Args) > 0 && strings.HasSuffix(strings.TrimSuffix(os.Args[0], ".exe"), ".test")

// New creates a new lazy regexp, delaying the compiling work until it is first
// needed. If the code is being run as part of tests, the regexp compiling will
// happen immediately.
func New(str string) *Regexp {
	lr := &Regexp{str: str}
	if inTest {
		// In tests, always compile the regexps early.
		lr.re()
	}
	return lr
}
package multierror
import (
"strings"
)
// Join is a drop-in replacement for errors.Join with better formatting.
func Join(errs ...error) error {
n := 0
for _, err := range errs {
if err != nil {
n++
}
}
if n == 0 {
return nil
}
e := &joinError{
errs: make([]error, 0, n),
}
for _, err := range errs {
if err != nil {
e.errs = append(e.errs, err)
}
}
return e
}
type joinError struct {
errs []error
}
func (e *joinError) Error() string {
if len(e.errs) == 1 {
return strings.TrimSpace(e.errs[0].Error())
}
stringErrs := make([]string, 0, len(e.errs))
for _, subErr := range e.errs {
stringErrs = append(stringErrs, strings.ReplaceAll(subErr.Error(), "\n", "\n\t"))
}
return "* " + strings.Join(stringErrs, "\n* ")
}
func (e *joinError) Unwrap() []error {
return e.errs
}
// Package nlwrap wraps vishvandanda/netlink functions that may return EINTR.
//
// A Handle instantiated using [NewHandle] or [NewHandleAt] can be used in place
// of a netlink.Handle, it's a wrapper that replaces methods that need to be
// wrapped. Functions that use the package handle need to be called as "nlwrap.X"
// instead of "netlink.X".
//
// When netlink.ErrDumpInterrupted is returned, the wrapped functions retry up to
// maxAttempts times. This error means NLM_F_DUMP_INTR was flagged in a netlink
// response, meaning something changed during the dump so results may be
// incomplete or inconsistent.
//
// To avoid retrying indefinitely, if netlink.ErrDumpInterrupted is still
// returned after maxAttempts, the wrapped functions will discard the error, log
// a stack trace to make the issue visible and aid in debugging, and return the
// possibly inconsistent results. Returning possibly inconsistent results matches
// the behaviour of vishvananda/netlink versions prior to 1.2.1, in which the
// NLM_F_DUMP_INTR flag was ignored.
package nlwrap
import (
"context"
"github.com/containerd/log"
"github.com/pkg/errors"
"github.com/vishvananda/netlink"
"github.com/vishvananda/netns"
)
// Arbitrary limit on max attempts at netlink calls if they are repeatedly interrupted.
const maxAttempts = 5
// Handle wraps a netlink.Handle; the embedded handle serves all methods that
// are not overridden by retrying wrappers in this package.
type Handle struct {
	*netlink.Handle
}
// NewHandle returns a Handle wrapping a netlink.Handle created for the given
// netlink families.
func NewHandle(nlFamilies ...int) (Handle, error) {
	nlh, err := netlink.NewHandle(nlFamilies...)
	if err != nil {
		return Handle{}, err
	}
	return Handle{nlh}, nil
}
// NewHandleAt returns a Handle wrapping a netlink.Handle created in the
// network namespace ns, for the given netlink families.
func NewHandleAt(ns netns.NsHandle, nlFamilies ...int) (Handle, error) {
	nlh, err := netlink.NewHandleAt(ns, nlFamilies...)
	if err != nil {
		return Handle{}, err
	}
	return Handle{nlh}, nil
}
// Close closes the wrapped netlink handle; it is a no-op if the handle is nil.
func (nlh Handle) Close() {
	if nlh.Handle != nil {
		nlh.Handle.Close()
	}
}
func retryOnIntr(f func() error) {
for attempt := 0; attempt < maxAttempts; attempt++ {
if err := f(); !errors.Is(err, netlink.ErrDumpInterrupted) {
return
}
}
log.G(context.TODO()).Infof("netlink call interrupted after %d attempts", maxAttempts)
}
// discardErrDumpInterrupted maps netlink.ErrDumpInterrupted to nil, logging a
// stack trace to make the dropped error visible; any other error (or nil) is
// passed through unchanged.
func discardErrDumpInterrupted(err error) error {
	if !errors.Is(err, netlink.ErrDumpInterrupted) {
		return err
	}
	// The netlink function has returned possibly-inconsistent data along with the
	// error. Discard the error and return the data. This restores the behaviour of
	// the netlink package prior to v1.2.1, in which NLM_F_DUMP_INTR was ignored in
	// the netlink response.
	log.G(context.TODO()).Warnf("discarding ErrDumpInterrupted: %+v", errors.WithStack(err))
	return nil
}
// AddrList calls nlh.Handle.AddrList, retrying if necessary.
func (nlh Handle) AddrList(link netlink.Link, family int) ([]netlink.Addr, error) {
	var (
		addrs []netlink.Addr
		err   error
	)
	retryOnIntr(func() error {
		addrs, err = nlh.Handle.AddrList(link, family) //nolint:forbidigo
		return err
	})
	return addrs, discardErrDumpInterrupted(err)
}
// AddrList calls netlink.AddrList, retrying if necessary.
func AddrList(link netlink.Link, family int) ([]netlink.Addr, error) {
	var (
		addrs []netlink.Addr
		err   error
	)
	retryOnIntr(func() error {
		addrs, err = netlink.AddrList(link, family) //nolint:forbidigo
		return err
	})
	return addrs, discardErrDumpInterrupted(err)
}
// ConntrackDeleteFilters calls nlh.Handle.ConntrackDeleteFilters, retrying if necessary.
func (nlh Handle) ConntrackDeleteFilters(
	table netlink.ConntrackTableType,
	family netlink.InetFamily,
	filters ...netlink.CustomConntrackFilter,
) (uint, error) {
	var (
		matched uint
		err     error
	)
	retryOnIntr(func() error {
		matched, err = nlh.Handle.ConntrackDeleteFilters(table, family, filters...) //nolint:forbidigo
		return err
	})
	return matched, discardErrDumpInterrupted(err)
}
// ConntrackTableList calls netlink.ConntrackTableList, retrying if necessary.
func ConntrackTableList(
	table netlink.ConntrackTableType,
	family netlink.InetFamily,
) ([]*netlink.ConntrackFlow, error) {
	var (
		flows []*netlink.ConntrackFlow
		err   error
	)
	retryOnIntr(func() error {
		flows, err = netlink.ConntrackTableList(table, family) //nolint:forbidigo
		return err
	})
	return flows, discardErrDumpInterrupted(err)
}
// LinkByName calls nlh.Handle.LinkByName, retrying if necessary. The netlink function
// doesn't normally ask the kernel for a dump of links. But, on an old kernel, it
// will do as a fallback and that dump may get inconsistent results.
func (nlh Handle) LinkByName(name string) (netlink.Link, error) {
	var (
		link netlink.Link
		err  error
	)
	retryOnIntr(func() error {
		link, err = nlh.Handle.LinkByName(name) //nolint:forbidigo
		return err
	})
	return link, discardErrDumpInterrupted(err)
}
// LinkByName calls netlink.LinkByName, retrying if necessary. The netlink
// function doesn't normally ask the kernel for a dump of links. But, on an old
// kernel, it will do as a fallback and that dump may get inconsistent results.
func LinkByName(name string) (netlink.Link, error) {
	var (
		link netlink.Link
		err  error
	)
	retryOnIntr(func() error {
		link, err = netlink.LinkByName(name) //nolint:forbidigo
		return err
	})
	return link, discardErrDumpInterrupted(err)
}
// LinkList calls nlh.Handle.LinkList, retrying if necessary.
func (nlh Handle) LinkList() ([]netlink.Link, error) {
	var (
		links []netlink.Link
		err   error
	)
	retryOnIntr(func() error {
		links, err = nlh.Handle.LinkList() //nolint:forbidigo
		return err
	})
	return links, discardErrDumpInterrupted(err)
}
// LinkList calls netlink.LinkList, retrying if necessary.
// (The body calls the package-level netlink.LinkList, not a Handle method.)
func LinkList() ([]netlink.Link, error) {
	var (
		links []netlink.Link
		err   error
	)
	retryOnIntr(func() error {
		links, err = netlink.LinkList() //nolint:forbidigo
		return err
	})
	return links, discardErrDumpInterrupted(err)
}
// LinkSubscribeWithOptions calls netlink.LinkSubscribeWithOptions, retrying if necessary.
// Close the done channel when done (rather than just sending on it), so that goroutines
// started by the netlink package are all stopped.
func LinkSubscribeWithOptions(ch chan<- netlink.LinkUpdate, done <-chan struct{}, options netlink.LinkSubscribeOptions) error {
	var err error
	retryOnIntr(func() error {
		err = netlink.LinkSubscribeWithOptions(ch, done, options) //nolint:forbidigo
		return err
	})
	return err
}
// RouteList calls nlh.Handle.RouteList, retrying if necessary.
func (nlh Handle) RouteList(link netlink.Link, family int) ([]netlink.Route, error) {
	var (
		routes []netlink.Route
		err    error
	)
	retryOnIntr(func() error {
		routes, err = nlh.Handle.RouteList(link, family) //nolint:forbidigo
		return err
	})
	return routes, discardErrDumpInterrupted(err)
}
// XfrmPolicyList calls nlh.Handle.XfrmPolicyList, retrying if necessary.
func (nlh Handle) XfrmPolicyList(family int) ([]netlink.XfrmPolicy, error) {
	var (
		policies []netlink.XfrmPolicy
		err      error
	)
	retryOnIntr(func() error {
		policies, err = nlh.Handle.XfrmPolicyList(family) //nolint:forbidigo
		return err
	})
	return policies, discardErrDumpInterrupted(err)
}
// XfrmStateList calls nlh.Handle.XfrmStateList, retrying if necessary.
func (nlh Handle) XfrmStateList(family int) ([]netlink.XfrmState, error) {
	var (
		states []netlink.XfrmState
		err    error
	)
	retryOnIntr(func() error {
		states, err = nlh.Handle.XfrmStateList(family) //nolint:forbidigo
		return err
	})
	return states, discardErrDumpInterrupted(err)
}
package opts
import (
"errors"
"net/netip"
)
// ValidateHostGatewayIPs makes sure the addresses are valid, and there's at-most one IPv4 and one IPv6 address.
func ValidateHostGatewayIPs(hostGatewayIPs []netip.Addr) error {
var have4, have6 bool
for _, ip := range hostGatewayIPs {
if ip.Is4() {
if have4 {
return errors.New("only one IPv4 host gateway IP address can be specified")
}
have4 = true
} else {
if have6 {
return errors.New("only one IPv6 host gateway IP address can be specified")
}
have6 = true
}
}
return nil
}
package opts
import (
"fmt"
"net/netip"
)
// NamedIPListOpts appends to an underlying []netip.Addr.
type NamedIPListOpts struct {
name string
ips *[]netip.Addr
}
// NewNamedIPListOptsRef constructs a NamedIPListOpts and returns its address.
func NewNamedIPListOptsRef(name string, values *[]netip.Addr) *NamedIPListOpts {
return &NamedIPListOpts{
name: name,
ips: values,
}
}
// String returns a string representation of the addresses in the underlying []netip.Addr.
func (o *NamedIPListOpts) String() string {
if len(*o.ips) == 0 {
return ""
}
return fmt.Sprintf("%v", *o.ips)
}
// Set converts value to a netip.Addr and appends it to the underlying []netip.Addr.
func (o *NamedIPListOpts) Set(value string) error {
ip, err := netip.ParseAddr(value)
if err != nil {
return err
}
*o.ips = append(*o.ips, ip)
return nil
}
// Type returns a string name for this Option type
func (o *NamedIPListOpts) Type() string {
return "list"
}
// Name returns the name of the NamedIPListOpts in the configuration.
func (o *NamedIPListOpts) Name() string {
return o.name
}
package opts
import (
"errors"
"fmt"
"strconv"
"strings"
"github.com/docker/docker/daemon/pkg/opts"
)
// SetOpts holds a map of values and a validation function.
type SetOpts struct {
	values map[string]bool
}

// Set validates if needed the input value and add it to the
// internal map, by splitting on '='.
func (opts *SetOpts) Set(value string) error {
	name, rawBool, hasValue := strings.Cut(value, "=")
	if name == "" {
		return errors.New("invalid option name: " + value)
	}
	// A bare name (no "=") means the option is enabled.
	enabled := true
	if hasValue {
		parsed, err := strconv.ParseBool(rawBool)
		if err != nil {
			return err
		}
		enabled = parsed
	}
	opts.values[name] = enabled
	return nil
}

// GetAll returns the values of SetOpts as a map.
func (opts *SetOpts) GetAll() map[string]bool {
	return opts.values
}

// String returns a string representation of the stored values.
func (opts *SetOpts) String() string {
	return fmt.Sprintf("%v", opts.values)
}

// Type returns a string name for this Option type
func (opts *SetOpts) Type() string {
	return "map"
}

// NewSetOpts creates a new SetOpts with the specified set of values as a map of string to bool.
func NewSetOpts(values map[string]bool) *SetOpts {
	if values == nil {
		values = make(map[string]bool)
	}
	return &SetOpts{values: values}
}
// NamedSetOpts is a SetOpts struct with a configuration name.
// This struct is useful to keep reference to the assigned
// field name in the internal configuration struct.
type NamedSetOpts struct {
	SetOpts
	name string
}

// Compile-time check that NamedSetOpts satisfies the NamedOption interface.
var _ opts.NamedOption = &NamedSetOpts{}

// NewNamedSetOpts creates a reference to a new NamedSetOpts struct.
func NewNamedSetOpts(name string, values map[string]bool) *NamedSetOpts {
	return &NamedSetOpts{
		SetOpts: *NewSetOpts(values),
		name:    name,
	}
}

// Name returns the name of the NamedSetOpts in the configuration.
func (o *NamedSetOpts) Name() string {
	return o.name
}
package otelutil
import (
"context"
"github.com/containerd/log"
"go.opentelemetry.io/otel/baggage"
)
// TriggerKey is the key used for the 'trigger' member in the baggage. It is
// used to know what triggered a code path (e.g. API call, libnet init, etc...)
const TriggerKey = "trigger"

// MustNewBaggage creates an OTel Baggage containing the provided members.
// On failure it logs at fatal level, which terminates the process.
// NOTE(review): despite the Must prefix, failure goes through log.Fatal
// rather than a recoverable panic — confirm against containerd/log semantics.
//
// DO NOT USE this function with dynamic values.
func MustNewBaggage(members ...baggage.Member) baggage.Baggage {
	b, err := baggage.New(members...)
	if err != nil {
		log.G(context.Background()).WithFields(log.Fields{
			"error":   err,
			"members": members,
		}).Fatal("OTel baggage creation failure")
	}
	return b
}
// MustNewMemberRaw creates an OTel Baggage member with the provided key and
// value. If the key or value aren't valid UTF-8 strings, it logs at fatal
// level, which terminates the process.
// NOTE(review): despite the Must prefix, failure goes through log.Fatal
// rather than a recoverable panic — confirm against containerd/log semantics.
//
// DO NOT USE this function with dynamic key/value.
func MustNewMemberRaw(key, value string) baggage.Member {
	m, err := baggage.NewMemberRaw(key, value)
	if err != nil {
		log.G(context.Background()).WithFields(log.Fields{
			"error": err,
			"key":   key,
			"value": value,
		}).Fatal("OTel baggage member creation failure")
	}
	return m
}
package otelutil
import (
"os"
)
const (
traceParentKey = "traceparent"
traceStateKey = "tracestate"
// See https://github.com/open-telemetry/opentelemetry-specification/issues/740
// and https://github.com/open-telemetry/oteps/pull/258.
traceParentEnvVar = "TRACEPARENT"
traceStateEnvVar = "TRACESTATE"
)
type EnvironCarrier struct {
TraceParent, TraceState string
}
// Get returns the value associated with the passed key.
func (c *EnvironCarrier) Get(key string) string {
switch key {
case traceParentKey:
return c.TraceParent
case traceStateKey:
return c.TraceState
}
return ""
}
// Set stores the key-value pair.
func (c *EnvironCarrier) Set(key, value string) {
switch key {
case traceParentKey:
c.TraceParent = value
case traceStateKey:
c.TraceState = value
}
// Other keys are not supported at this time.
}
// Keys lists the keys stored in this carrier.
func (c *EnvironCarrier) Keys() []string {
var k []string
if c.TraceParent != "" {
k = append(k, traceParentKey)
}
if c.TraceState != "" {
k = append(k, traceStateKey)
}
return k
}
func (c *EnvironCarrier) Environ() []string {
var env []string
if c.TraceParent != "" {
env = append(env, traceParentEnvVar+"="+c.TraceParent)
}
if c.TraceState != "" {
env = append(env, traceStateEnvVar+"="+c.TraceState)
}
return env
}
func PropagateFromEnvironment() *EnvironCarrier {
return &EnvironCarrier{
TraceParent: os.Getenv(traceParentEnvVar),
TraceState: os.Getenv(traceStateEnvVar),
}
}
package otelutil
import (
"context"
"github.com/containerd/log"
"github.com/moby/buildkit/util/tracing/detect"
"go.opentelemetry.io/contrib/processors/baggagecopy"
"go.opentelemetry.io/otel/baggage"
"go.opentelemetry.io/otel/sdk/resource"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
"go.opentelemetry.io/otel/trace"
"go.opentelemetry.io/otel/trace/noop"
)
// NewTracerProvider returns a trace.TracerProvider built from the span
// exporter detected by moby/buildkit's detect package, together with a
// shutdown function to be called before process exit.
//
// If allowNoop is true and either exporter creation fails or no exporter is
// configured, a no-op tracer provider (with a no-op shutdown) is returned
// instead.
func NewTracerProvider(ctx context.Context, allowNoop bool) (trace.TracerProvider, func(context.Context) error) {
	noopShutdown := func(ctx context.Context) error { return nil }
	exp, err := detect.NewSpanExporter(ctx)
	if err != nil {
		log.G(ctx).WithError(err).Warn("Failed to initialize tracing, skipping")
		if allowNoop {
			return noop.NewTracerProvider(), noopShutdown
		}
		// NOTE(review): when allowNoop is false we fall through here with a
		// possibly-nil exporter that is then handed to WithBatcher below —
		// confirm sdktrace tolerates this, or that callers pass allowNoop=true.
	}
	if allowNoop && detect.IsNoneSpanExporter(exp) {
		log.G(ctx).Info("OTEL tracing is not configured, using no-op tracer provider")
		return noop.NewTracerProvider(), noopShutdown
	}
	tp := sdktrace.NewTracerProvider(
		sdktrace.WithResource(resource.Default()),
		// Record spans synchronously to the detect package's in-memory
		// recorder, in addition to batching them to the exporter.
		sdktrace.WithSyncer(detect.Recorder),
		sdktrace.WithBatcher(exp),
		// Copy every baggage member onto each span (the filter accepts all).
		sdktrace.WithSpanProcessor(baggagecopy.NewSpanProcessor(func(member baggage.Member) bool { return true })),
	)
	return tp, tp.Shutdown
}
package otelutil
import (
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
)
// RecordStatus records the status of a span based on the error provided.
//
// If err is nil, the span status is unmodified. If err is not nil, the span
// takes status Error, and the error message is recorded.
func RecordStatus(span trace.Span, err error) {
if err != nil {
span.RecordError(err)
span.SetStatus(codes.Error, err.Error())
}
}
package platform
import (
"context"
"runtime"
"sync"
"github.com/containerd/log"
)
var (
	// arch caches the result of runtimeArchitecture; populated once by
	// Architecture via onceArch.
	arch     string
	onceArch sync.Once
)

// Architecture returns the runtime architecture of the process.
//
// Unlike [runtime.GOARCH] (which refers to the compiler platform),
// Architecture refers to the running platform.
//
// For example, Architecture reports "x86_64" as architecture, even
// when running a "linux/386" compiled binary on "linux/amd64" hardware.
func Architecture() string {
	onceArch.Do(func() {
		var err error
		arch, err = runtimeArchitecture()
		if err != nil {
			// Error is logged, not returned: callers always get a string,
			// which stays empty when the lookup failed.
			log.G(context.TODO()).WithError(err).Error("Could not read system architecture info")
		}
	})
	return arch
}
// PossibleCPU returns the set of possible CPUs on the host (which is equal or
// larger to the number of CPUs currently online). The returned set may be a
// single CPU number ({0}), or a continuous range of CPU numbers ({0,1,2,3}), or
// a non-continuous range of CPU numbers ({0,1,2,3,12,13,14,15}).
func PossibleCPU() []int {
	if ncpu := possibleCPUs(); ncpu != nil {
		return ncpu
	}
	// Fallback in case possibleCPUs() fails: assume CPU numbers form the
	// half-open range [0, runtime.NumCPU()), i.e. exactly NumCPU() entries.
	// (The previous loop used `i <= ncpu`, yielding NumCPU()+1 entries
	// including the out-of-range CPU number ncpu.)
	cpus := make([]int, runtime.NumCPU())
	for i := range cpus {
		cpus[i] = i
	}
	return cpus
}
// TODO(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package platform
import (
"os"
"strconv"
"strings"
"sync"
)
// possibleCPUs returns the set of possible CPUs on the host (which is
// equal or larger to the number of CPUs currently online). The returned
// set may be a single number ({0}), or a continuous range ({0,1,2,3}), or
// a non-continuous range ({0,1,2,3,12,13,14,15})
//
// The kernel's list is read from /sys/devices/system/cpu/possible exactly
// once; the parsed result is cached for the process lifetime.
//
// Returns nil on errors. Assume CPUs are 0 -> runtime.NumCPU() in that case.
var possibleCPUs = sync.OnceValue(func() []int {
	data, err := os.ReadFile("/sys/devices/system/cpu/possible")
	if err != nil {
		return nil
	}
	content := strings.TrimSpace(string(data))
	return parsePossibleCPUs(content)
})
func parsePossibleCPUs(content string) []int {
ranges := strings.Split(content, ",")
var cpus []int
for _, r := range ranges {
// Each entry is either a single number (e.g., "0") or a continuous range
// (e.g., "0-3").
if rStart, rEnd, ok := strings.Cut(r, "-"); !ok {
cpu, err := strconv.Atoi(rStart)
if err != nil {
return nil
}
cpus = append(cpus, cpu)
} else {
var start, end int
start, err := strconv.Atoi(rStart)
if err != nil {
return nil
}
end, err = strconv.Atoi(rEnd)
if err != nil {
return nil
}
for i := start; i <= end; i++ {
cpus = append(cpus, i)
}
}
}
return cpus
}
//go:build !windows
package platform
import (
"golang.org/x/sys/unix"
)
// runtimeArchitecture gets the name of the current architecture (x86, x86_64, i86pc, sun4v, ...)
func runtimeArchitecture() (string, error) {
	var uts unix.Utsname
	if err := unix.Uname(&uts); err != nil {
		return "", err
	}
	return unix.ByteSliceToString(uts.Machine[:]), nil
}
package mountopts
import (
"golang.org/x/sys/unix"
)
// UnprivilegedMountFlags gets the set of mount flags that are set on the mount that contains the given
// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
// bind-mounting "with options" will not fail with user namespaces, due to
// kernel restrictions that require user namespace mounts to preserve
// CL_UNPRIVILEGED locked flags.
//
// TODO: Move to github.com/moby/sys/mount, and update BuildKit copy of this code as well (https://github.com/moby/buildkit/blob/v0.13.0/util/rootless/mountopts/mountopts_linux.go#L11-L18)
func UnprivilegedMountFlags(path string) ([]string, error) {
	var st unix.Statfs_t
	if err := unix.Statfs(path, &st); err != nil {
		return nil, err
	}
	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
	lockedFlags := map[uint64]string{
		unix.MS_RDONLY:     "ro",
		unix.MS_NODEV:      "nodev",
		unix.MS_NOEXEC:     "noexec",
		unix.MS_NOSUID:     "nosuid",
		unix.MS_NOATIME:    "noatime",
		unix.MS_RELATIME:   "relatime",
		unix.MS_NODIRATIME: "nodiratime",
	}
	var flags []string
	for mask, name := range lockedFlags {
		if uint64(st.Flags)&mask == mask {
			flags = append(flags, name)
		}
	}
	return flags, nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package sliceutil
// Dedup returns a new slice containing the elements of slice with duplicates
// removed, keeping the first occurrence of each value in order.
func Dedup[T comparable](slice []T) []T {
	seen := make(map[T]struct{}, len(slice))
	result := make([]T, 0, len(slice))
	for _, v := range slice {
		if _, dup := seen[v]; dup {
			continue
		}
		seen[v] = struct{}{}
		result = append(result, v)
	}
	return result
}
// Map returns a new slice holding fn applied to each element of s, in order.
func Map[S ~[]In, In, Out any](s S, fn func(In) Out) []Out {
	out := make([]Out, len(s))
	for i := range s {
		out[i] = fn(s[i])
	}
	return out
}
// Mapper returns a function that applies fn to every element of a slice,
// producing a new slice of the results.
func Mapper[In, Out any](fn func(In) Out) func([]In) []Out {
	return func(s []In) []Out {
		out := make([]Out, 0, len(s))
		for _, v := range s {
			out = append(out, fn(v))
		}
		return out
	}
}
package unix_noeintr
import (
"errors"
"golang.org/x/sys/unix"
)
// EpollCreate creates an epoll instance (close-on-exec), retrying on EINTR.
func EpollCreate() (int, error) {
	for {
		fd, err := unix.EpollCreate1(unix.EPOLL_CLOEXEC)
		if !errors.Is(err, unix.EINTR) {
			return fd, err
		}
	}
}
// EpollCtl manipulates the interest list of epoll instance epFd, retrying on EINTR.
func EpollCtl(epFd int, op int, fd int, event *unix.EpollEvent) error {
	for {
		if err := unix.EpollCtl(epFd, op, fd, event); !errors.Is(err, unix.EINTR) {
			return err
		}
	}
}
// EpollWait waits for events on epoll instance epFd, retrying on EINTR.
func EpollWait(epFd int, events []unix.EpollEvent, msec int) (int, error) {
	for {
		n, err := unix.EpollWait(epFd, events, msec)
		if !errors.Is(err, unix.EINTR) {
			return n, err
		}
	}
}
//go:build !windows
// Package unix_noeintr provides wrappers for unix syscalls that retry on EINTR.
//
// TODO: Consider moving (for example to moby/sys) and making the wrappers auto-generated.
package unix_noeintr
import (
"errors"
"golang.org/x/sys/unix"
)
// Retry calls f repeatedly until it returns anything other than EINTR.
func Retry(f func() error) {
	for errors.Is(f(), unix.EINTR) {
	}
}
// Mount is a retry-on-EINTR wrapper for unix.Mount.
func Mount(source string, target string, fstype string, flags uintptr, data string) (err error) {
	Retry(func() error {
		err = unix.Mount(source, target, fstype, flags, data)
		return err
	})
	return err
}
// Unmount is a retry-on-EINTR wrapper for unix.Unmount.
func Unmount(target string, flags int) (err error) {
	Retry(func() error {
		err = unix.Unmount(target, flags)
		return err
	})
	return err
}
// Open is a retry-on-EINTR wrapper for unix.Open.
func Open(path string, mode int, perm uint32) (fd int, err error) {
	Retry(func() error {
		fd, err = unix.Open(path, mode, perm)
		return err
	})
	return fd, err
}
// Close is a retry-on-EINTR wrapper for unix.Close.
func Close(fd int) (err error) {
	Retry(func() error {
		err = unix.Close(fd)
		return err
	})
	return err
}
// Openat is a retry-on-EINTR wrapper for unix.Openat.
func Openat(dirfd int, path string, mode int, perms uint32) (fd int, err error) {
	Retry(func() error {
		fd, err = unix.Openat(dirfd, path, mode, perms)
		return err
	})
	return fd, err
}
// Openat2 is a retry-on-EINTR wrapper for unix.Openat2.
func Openat2(dirfd int, path string, how *unix.OpenHow) (fd int, err error) {
	Retry(func() error {
		fd, err = unix.Openat2(dirfd, path, how)
		return err
	})
	return fd, err
}
// Fstat is a retry-on-EINTR wrapper for unix.Fstat.
func Fstat(fd int, stat *unix.Stat_t) (err error) {
	Retry(func() error {
		err = unix.Fstat(fd, stat)
		return err
	})
	return err
}
// Fstatat is a retry-on-EINTR wrapper for unix.Fstatat.
func Fstatat(fd int, path string, stat *unix.Stat_t, flags int) (err error) {
	Retry(func() error {
		err = unix.Fstatat(fd, path, stat, flags)
		return err
	})
	return err
}
//go:build go1.10
package unshare
import (
"fmt"
"os"
"runtime"
"golang.org/x/sys/unix"
)
// init pins the main goroutine — and therefore main() — to the process'
// startup thread, so that its per-thread kernel state can never be unshared
// by a goroutine scheduled onto it. Full rationale below.
func init() {
	// The startup thread of a process is special in a few different ways.
	// Most pertinent to the discussion at hand, any per-thread kernel state
	// reflected in the /proc/[pid]/ directory for a process is taken from
	// the state of the startup thread. Same goes for /proc/self/; it shows
	// the state of the current process' startup thread, no matter which
	// thread the files are being opened from. For most programs this is a
	// distinction without a difference as the kernel state, such as the
	// mount namespace and current working directory, is shared among (and
	// kept synchronized across) all threads of a process. But things start
	// to break down once threads start unsharing and modifying parts of
	// their kernel state.
	//
	// The Go runtime schedules goroutines to execute on the startup thread,
	// same as any other. How this could be problematic is best illustrated
	// with a concrete example. Consider what happens if a call to
	// Go(unix.CLONE_NEWNS, ...) spawned a goroutine which gets scheduled
	// onto the startup thread. The thread's mount namespace will be
	// unshared and modified. The contents of the /proc/[pid]/mountinfo file
	// will then describe the mount tree of the unshared namespace, not the
	// namespace of any other thread. It will remain this way until the
	// process exits. (The startup thread is special in another way: exiting
	// it puts the process into a "non-waitable zombie" state. To avoid this
	// fate, the Go runtime parks the thread instead of exiting if a
	// goroutine returns while locked to the startup thread. More
	// information can be found in the Go runtime sources:
	// `go doc -u -src runtime.mexit`.) The github.com/moby/sys/mountinfo
	// package reads from /proc/self/mountinfo, so will read the mount tree
	// for the wrong namespace if the startup thread has had its mount
	// namespace unshared! The /proc/thread-self/ directory, introduced in
	// Linux 3.17, is one potential solution to this problem, but every
	// package which opens files in /proc/self/ would need to be updated,
	// and fallbacks to /proc/self/task/[tid]/ would be required to support
	// older kernels. Overlooking any reference to /proc/self/ would
	// manifest as stochastically-reproducible bugs, so this is far from an
	// ideal solution.
	//
	// Reading from /proc/self/ would not be a problem if we could prevent
	// the per-thread state of the startup thread from being modified
	// nondeterministically in the first place. We can accomplish this
	// simply by locking the main() function to the startup thread! Doing so
	// excludes any other goroutine from being scheduled on the thread.
	runtime.LockOSThread()
}
// reversibleSetnsFlags maps the unshare(2) flags whose effects can be fully
// reversed using setns(2). The values are the basenames of the corresponding
// /proc/self/task/[tid]/ns/ magic symlinks to use to save and restore the
// state. Go() consults this map to decide whether a thread can be restored
// (and reused) after fn returns, or must be discarded.
var reversibleSetnsFlags = map[int]string{
	unix.CLONE_NEWCGROUP: "cgroup",
	unix.CLONE_NEWNET:    "net",
	unix.CLONE_NEWUTS:    "uts",
	unix.CLONE_NEWPID:    "pid",
	unix.CLONE_NEWTIME:   "time",

	// The following CLONE_NEW* flags are not included because they imply
	// another, irreversible flag when used with unshare(2).
	// - unix.CLONE_NEWIPC: implies CLONE_SYSVMEM
	// - unix.CLONE_NEWNS: implies CLONE_FS
	// - unix.CLONE_NEWUSER: implies CLONE_FS since Linux 3.9
}
// Go calls the given functions in a new goroutine, locked to an OS thread,
// which has had the parts of its execution state disassociated from the rest of
// the current process using [unshare(2)]. It blocks until the new goroutine has
// started and setupfn has returned. fn is only called if setupfn returns nil. A
// nil setupfn or fn is equivalent to passing a no-op function.
//
// The disassociated execution state and any changes made to it are only visible
// to the goroutine which the functions are called in. Any other goroutines,
// including ones started from the function, will see the same execution state
// as the rest of the process.
//
// The acceptable flags are documented in the [unshare(2)] Linux man-page.
// The corresponding CLONE_* constants are defined in package [unix].
//
// # Warning
//
// This function may terminate the thread which the new goroutine executed on
// after fn returns, which could cause subprocesses started with the
// [syscall.SysProcAttr] Pdeathsig field set to be signaled before process
// termination. Any subprocess started before this function is called may be
// affected, in addition to any subprocesses started inside setupfn or fn.
// There are more details at https://go.dev/issue/27505.
//
// [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html
func Go(flags int, setupfn func() error, fn func()) error {
	started := make(chan error)

	// The thread is only reusable afterwards if every requested flag is
	// one whose effect can be undone with setns(2) (see
	// reversibleSetnsFlags); mask those off and check nothing remains.
	maskedFlags := flags
	for f := range reversibleSetnsFlags {
		maskedFlags &^= f
	}
	isReversible := maskedFlags == 0

	go func() {
		// Prepare to manipulate per-thread kernel state.
		runtime.LockOSThread()

		// Not all changes to the execution state can be reverted.
		// If an irreversible change to the execution state is made, our
		// only recourse is to have the tampered thread terminated by
		// returning from this function while the goroutine remains
		// wired to the thread. The Go runtime will terminate the thread
		// and replace it with a fresh one as needed.

		if isReversible {
			defer func() {
				if isReversible {
					// All execution state has been restored without error.
					// The thread is once again fungible.
					runtime.UnlockOSThread()
				}
			}()
			tid := unix.Gettid()
			for f, ns := range reversibleSetnsFlags {
				if flags&f != f {
					continue
				}
				// The /proc/thread-self directory was added in Linux 3.17.
				// We are not using it to maximize compatibility.
				pth := fmt.Sprintf("/proc/self/task/%d/ns/%s", tid, ns)
				fd, err := unix.Open(pth, unix.O_RDONLY|unix.O_CLOEXEC, 0)
				if err != nil {
					started <- &os.PathError{Op: "open", Path: pth, Err: err}
					return
				}
				// Deferred restores run LIFO after fn returns; a failed
				// setns poisons isReversible so the outer defer leaves
				// the thread locked for the runtime to dispose of.
				defer func() {
					if isReversible {
						if err := unix.Setns(fd, 0); err != nil {
							isReversible = false
						}
					}
					_ = unix.Close(fd)
				}()
			}
		}

		// Threads are implemented under Linux as processes which share
		// a virtual memory space. Therefore in a multithreaded process
		// unshare(2) disassociates parts of the calling thread's
		// context from the thread it was clone(2)'d from.
		if err := unix.Unshare(flags); err != nil {
			started <- os.NewSyscallError("unshare", err)
			return
		}
		if setupfn != nil {
			if err := setupfn(); err != nil {
				started <- err
				return
			}
		}
		// Closing (rather than sending on) started signals success to
		// the caller, which is blocked receiving from it.
		close(started)

		if fn != nil {
			fn()
		}
	}()

	return <-started
}
package layer
import (
"archive/tar"
"bytes"
"errors"
"io"
)
// DigestSHA256EmptyTar is the canonical sha256 digest of empty tar file -
// (1024 NULL bytes)
const DigestSHA256EmptyTar DiffID = "sha256:5f70bf18a086007016e948b04aed3b82103a36bea41755b6cddfaf10ace3c6ef"

type emptyLayer struct{}

// EmptyLayer is a layer that corresponds to empty tar.
var EmptyLayer = &emptyLayer{}

// TarStream returns a reader over a freshly produced empty tar archive
// (the trailer written by closing an unused tar writer).
func (l *emptyLayer) TarStream() (io.ReadCloser, error) {
	var buf bytes.Buffer
	// Closing the writer emits only the archive trailer; writing to an
	// in-memory buffer cannot fail, so the error is deliberately ignored.
	_ = tar.NewWriter(&buf).Close()
	return io.NopCloser(&buf), nil
}

// TarStreamFrom behaves like TarStream when no parent chain ID is given.
// An empty layer has no parents, so any other chain ID is an error.
func (l *emptyLayer) TarStreamFrom(p ChainID) (io.ReadCloser, error) {
	if p != "" {
		return nil, errors.New("can't get parent tar stream of an empty layer")
	}
	return l.TarStream()
}

// ChainID returns the canonical digest of the empty tar.
func (l *emptyLayer) ChainID() ChainID { return DigestSHA256EmptyTar }

// DiffID returns the canonical digest of the empty tar.
func (l *emptyLayer) DiffID() DiffID { return DigestSHA256EmptyTar }

// Parent returns nil; the empty layer has no parent.
func (l *emptyLayer) Parent() Layer { return nil }

// Size returns 0; the empty layer contains no data.
func (l *emptyLayer) Size() int64 { return 0 }

// DiffSize returns 0; the empty layer contains no data.
func (l *emptyLayer) DiffSize() int64 { return 0 }

// Metadata returns an empty metadata map.
func (l *emptyLayer) Metadata() (map[string]string, error) {
	return map[string]string{}, nil
}

// IsEmpty returns true if the layer is an EmptyLayer
func IsEmpty(diffID DiffID) bool {
	return diffID == DigestSHA256EmptyTar
}
package layer
import (
"compress/gzip"
"context"
"encoding/json"
"io"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/containerd/log"
"github.com/docker/distribution"
"github.com/docker/docker/pkg/ioutils"
"github.com/moby/sys/atomicwriter"
"github.com/opencontainers/go-digest"
"github.com/pkg/errors"
)
// supportedAlgorithms lists the digest algorithms which may appear as
// directory names under the metadata root. Only SHA-256 is currently used.
var supportedAlgorithms = []digest.Algorithm{
	digest.SHA256,
	// digest.SHA384, // Currently not used
	// digest.SHA512, // Currently not used
}

// fileMetadataStore persists layer and mount metadata as plain files under a
// single root directory.
type fileMetadataStore struct {
	root string
}

// fileMetadataTransaction stages layer metadata files in a temporary write
// set which is atomically moved into place on Commit.
type fileMetadataTransaction struct {
	store *fileMetadataStore
	ws    *atomicwriter.WriteSet
}
// newFSMetadataStore returns an instance of a metadata store
// which is backed by files on disk using the provided root
// as the root of metadata files.
func newFSMetadataStore(root string) (*fileMetadataStore, error) {
	// Ensure the metadata root exists before handing out the store.
	if err := os.MkdirAll(root, 0o700); err != nil {
		return nil, err
	}
	return &fileMetadataStore{root: root}, nil
}
// getLayerDirectory returns the on-disk metadata directory for the layer,
// i.e. <root>/<algorithm>/<encoded-digest>.
func (fms *fileMetadataStore) getLayerDirectory(layer ChainID) string {
	return filepath.Join(fms.root, string(layer.Algorithm()), layer.Encoded())
}

// getLayerFilename returns the path of a named metadata file for the layer.
func (fms *fileMetadataStore) getLayerFilename(layer ChainID, filename string) string {
	return filepath.Join(fms.getLayerDirectory(layer), filename)
}

// getMountDirectory returns the metadata directory for the named mount,
// i.e. <root>/mounts/<mount>.
func (fms *fileMetadataStore) getMountDirectory(mount string) string {
	return filepath.Join(fms.root, "mounts", mount)
}

// getMountFilename returns the path of a named metadata file for the mount.
func (fms *fileMetadataStore) getMountFilename(mount, filename string) string {
	return filepath.Join(fms.getMountDirectory(mount), filename)
}
// StartTransaction begins a new metadata transaction backed by a write set
// under <root>/tmp. Files staged through the transaction only become visible
// under the layer directory once Commit is called.
func (fms *fileMetadataStore) StartTransaction() (*fileMetadataTransaction, error) {
	tmpDir := filepath.Join(fms.root, "tmp")
	if err := os.MkdirAll(tmpDir, 0o755); err != nil {
		return nil, err
	}
	ws, err := atomicwriter.NewWriteSet(tmpDir)
	if err != nil {
		return nil, err
	}
	return &fileMetadataTransaction{
		store: fms,
		ws:    ws,
	}, nil
}

// SetSize stages the layer size (bytes) as a decimal string.
func (fm *fileMetadataTransaction) SetSize(size int64) error {
	return fm.ws.WriteFile("size", []byte(strconv.FormatInt(size, 10)), 0o644)
}

// SetParent stages the chain ID of the layer's parent.
func (fm *fileMetadataTransaction) SetParent(parent ChainID) error {
	return fm.ws.WriteFile("parent", []byte(parent.String()), 0o644)
}

// SetDiffID stages the diff ID of the layer.
func (fm *fileMetadataTransaction) SetDiffID(diff DiffID) error {
	return fm.ws.WriteFile("diff", []byte(diff.String()), 0o644)
}

// SetCacheID stages the graph-driver cache ID of the layer.
func (fm *fileMetadataTransaction) SetCacheID(cacheID string) error {
	return fm.ws.WriteFile("cache-id", []byte(cacheID), 0o644)
}

// SetDescriptor stages the distribution descriptor, serialized as JSON.
func (fm *fileMetadataTransaction) SetDescriptor(ref distribution.Descriptor) error {
	jsonRef, err := json.Marshal(ref)
	if err != nil {
		return err
	}
	return fm.ws.WriteFile("descriptor.json", jsonRef, 0o644)
}
// TarSplitWriter returns a writer which stages the layer's tar-split
// metadata ("tar-split.json.gz"). When compressInput is true the written
// data is gzip-compressed on the fly. The returned WriteCloser must be
// closed to flush the (optional) gzip stream and the underlying file.
func (fm *fileMetadataTransaction) TarSplitWriter(compressInput bool) (io.WriteCloser, error) {
	f, err := fm.ws.FileWriter("tar-split.json.gz", os.O_TRUNC|os.O_CREATE|os.O_WRONLY, 0o644)
	if err != nil {
		return nil, err
	}
	var wc io.WriteCloser
	if compressInput {
		wc = gzip.NewWriter(f)
	} else {
		wc = f
	}

	return ioutils.NewWriteCloserWrapper(wc, func() error {
		// Close the gzip writer first (flushing its footer), then the
		// underlying file. When compression was not requested, wc and f
		// are the same object; closing it twice would make the second
		// Close return a spurious "already closed" error, so only close
		// wc separately when it actually wraps f.
		if wc != f {
			_ = wc.Close()
		}
		return f.Close()
	}), nil
}
// Commit atomically moves the staged metadata files into the layer's final
// directory, creating the parent (algorithm) directory if needed.
func (fm *fileMetadataTransaction) Commit(layer ChainID) error {
	finalDir := fm.store.getLayerDirectory(layer)
	if err := os.MkdirAll(filepath.Dir(finalDir), 0o755); err != nil {
		return err
	}
	return fm.ws.Commit(finalDir)
}

// Cancel discards all staged metadata files.
func (fm *fileMetadataTransaction) Cancel() error {
	return fm.ws.Cancel()
}

// String identifies the transaction (its staging directory) for logging.
func (fm *fileMetadataTransaction) String() string {
	return fm.ws.String()
}
// GetSize reads the persisted size (bytes) recorded for the given layer.
func (fms *fileMetadataStore) GetSize(layer ChainID) (int64, error) {
	raw, err := os.ReadFile(fms.getLayerFilename(layer, "size"))
	if err != nil {
		return 0, err
	}
	size, err := strconv.ParseInt(string(raw), 10, 64)
	if err != nil {
		return 0, err
	}
	return size, nil
}

// GetParent reads the chain ID of the layer's parent. A missing parent file
// means the layer has no parent, reported as an empty ChainID and nil error.
func (fms *fileMetadataStore) GetParent(layer ChainID) (ChainID, error) {
	raw, err := os.ReadFile(fms.getLayerFilename(layer, "parent"))
	if os.IsNotExist(err) {
		return "", nil
	}
	if err != nil {
		return "", err
	}
	dgst, err := digest.Parse(strings.TrimSpace(string(raw)))
	if err != nil {
		return "", err
	}
	return dgst, nil
}

// GetDiffID reads the diff ID recorded for the given layer.
func (fms *fileMetadataStore) GetDiffID(layer ChainID) (DiffID, error) {
	raw, err := os.ReadFile(fms.getLayerFilename(layer, "diff"))
	if err != nil {
		return "", err
	}
	dgst, err := digest.Parse(strings.TrimSpace(string(raw)))
	if err != nil {
		return "", err
	}
	return dgst, nil
}
// GetCacheID reads the graph-driver cache ID recorded for the given layer.
// An empty value is rejected because the cache ID is required to locate the
// layer's data in the graph driver.
func (fms *fileMetadataStore) GetCacheID(layer ChainID) (string, error) {
	contentBytes, err := os.ReadFile(fms.getLayerFilename(layer, "cache-id"))
	if err != nil {
		return "", err
	}
	content := strings.TrimSpace(string(contentBytes))
	if content == "" {
		// errors.New instead of Errorf: the message is a constant string
		// with no format arguments (staticcheck S1039).
		return "", errors.New("invalid cache id value")
	}
	return content, nil
}
// GetDescriptor reads the distribution descriptor stored for the given layer.
// A missing descriptor file is not an error; an empty descriptor is returned
// to represent what is stored.
func (fms *fileMetadataStore) GetDescriptor(layer ChainID) (distribution.Descriptor, error) {
	content, err := os.ReadFile(fms.getLayerFilename(layer, "descriptor.json"))
	if err != nil {
		if os.IsNotExist(err) {
			// only return empty descriptor to represent what is stored
			return distribution.Descriptor{}, nil
		}
		return distribution.Descriptor{}, err
	}

	var ref distribution.Descriptor
	if err := json.Unmarshal(content, &ref); err != nil {
		return distribution.Descriptor{}, err
	}
	// Return an explicit nil error rather than the stale (known-nil) err.
	return ref, nil
}
// TarSplitReader opens the stored tar-split metadata for the given layer and
// returns it as a decompressed stream. Closing the returned reader closes
// both the gzip reader and the underlying file.
func (fms *fileMetadataStore) TarSplitReader(layer ChainID) (io.ReadCloser, error) {
	raw, err := os.Open(fms.getLayerFilename(layer, "tar-split.json.gz"))
	if err != nil {
		return nil, err
	}
	gz, err := gzip.NewReader(raw)
	if err != nil {
		raw.Close()
		return nil, err
	}
	closeBoth := func() error {
		gz.Close()
		return raw.Close()
	}
	return ioutils.NewReadCloserWrapper(gz, closeBoth), nil
}
// SetMountID persists the graph-driver mount ID for the named mount.
func (fms *fileMetadataStore) SetMountID(mount string, mountID string) error {
	if err := os.MkdirAll(fms.getMountDirectory(mount), 0o755); err != nil {
		return err
	}
	return os.WriteFile(fms.getMountFilename(mount, "mount-id"), []byte(mountID), 0o644)
}

// SetInitID persists the init layer ID for the named mount.
func (fms *fileMetadataStore) SetInitID(mount string, init string) error {
	if err := os.MkdirAll(fms.getMountDirectory(mount), 0o755); err != nil {
		return err
	}
	return os.WriteFile(fms.getMountFilename(mount, "init-id"), []byte(init), 0o644)
}

// SetMountParent persists the chain ID of the mount's parent layer.
func (fms *fileMetadataStore) SetMountParent(mount string, parent ChainID) error {
	if err := os.MkdirAll(fms.getMountDirectory(mount), 0o755); err != nil {
		return err
	}
	return os.WriteFile(fms.getMountFilename(mount, "parent"), []byte(parent.String()), 0o644)
}
// GetMountID reads the graph-driver mount ID recorded for the named mount,
// validating its format (64 hex chars, optional "-init" suffix) first.
func (fms *fileMetadataStore) GetMountID(mount string) (string, error) {
	contentBytes, err := os.ReadFile(fms.getMountFilename(mount, "mount-id"))
	if err != nil {
		return "", err
	}
	content := strings.TrimSpace(string(contentBytes))

	if !isValidID(content) {
		return "", errors.New("invalid mount id value")
	}

	return content, nil
}

// GetInitID reads the init layer ID recorded for the named mount. A missing
// init-id file is not an error: it means the mount has no init layer.
func (fms *fileMetadataStore) GetInitID(mount string) (string, error) {
	contentBytes, err := os.ReadFile(fms.getMountFilename(mount, "init-id"))
	if err != nil {
		if os.IsNotExist(err) {
			return "", nil
		}
		return "", err
	}
	content := strings.TrimSpace(string(contentBytes))

	if !isValidID(content) {
		return "", errors.New("invalid init id value")
	}

	return content, nil
}
// GetMountParent reads the chain ID of the parent layer for the named mount.
// A missing parent file means the mount has no parent layer, reported as an
// empty ChainID and nil error.
func (fms *fileMetadataStore) GetMountParent(mount string) (ChainID, error) {
	raw, err := os.ReadFile(fms.getMountFilename(mount, "parent"))
	if os.IsNotExist(err) {
		return "", nil
	}
	if err != nil {
		return "", err
	}
	dgst, err := digest.Parse(strings.TrimSpace(string(raw)))
	if err != nil {
		return "", err
	}
	return dgst, nil
}
// getOrphan scans the metadata root for layer directories which were renamed
// to "<digest>-<random>-removing" (see deleteLayer) but whose removal never
// completed, and returns them so cleanup can be retried.
func (fms *fileMetadataStore) getOrphan() ([]roLayer, error) {
	var orphanLayers []roLayer
	for _, algorithm := range supportedAlgorithms {
		fileInfos, err := os.ReadDir(filepath.Join(fms.root, string(algorithm)))
		if err != nil {
			if os.IsNotExist(err) {
				// No layers stored under this algorithm; not an error.
				continue
			}
			return nil, err
		}

		for _, fi := range fileInfos {
			if !fi.IsDir() || !strings.HasSuffix(fi.Name(), "-removing") {
				continue
			}
			// At this stage, fi.Name value looks like <digest>-<random>-removing
			// Split on '-' to get the digest value.
			nameSplit := strings.Split(fi.Name(), "-")
			dgst := digest.NewDigestFromEncoded(algorithm, nameSplit[0])
			if err := dgst.Validate(); err != nil {
				log.G(context.TODO()).WithError(err).WithField("digest", string(algorithm)+":"+nameSplit[0]).Debug("ignoring invalid digest")
				continue
			}

			chainFile := filepath.Join(fms.root, string(algorithm), fi.Name(), "cache-id")
			contentBytes, err := os.ReadFile(chainFile)
			if err != nil {
				if !os.IsNotExist(err) {
					log.G(context.TODO()).WithError(err).WithField("digest", dgst).Error("failed to read cache ID")
				}
				// Without a cache ID the driver data cannot be located;
				// skip this entry.
				continue
			}
			cacheID := strings.TrimSpace(string(contentBytes))
			if cacheID == "" {
				log.G(context.TODO()).Error("invalid cache ID")
				continue
			}

			l := &roLayer{
				chainID: dgst,
				cacheID: cacheID,
			}
			orphanLayers = append(orphanLayers, *l)
		}
	}
	return orphanLayers, nil
}
// List returns the chain IDs of all layers and the names of all mounts known
// to the store. Directories whose names are not valid digests are skipped
// with a debug log.
func (fms *fileMetadataStore) List() ([]ChainID, []string, error) {
	var ids []ChainID
	for _, algorithm := range supportedAlgorithms {
		fileInfos, err := os.ReadDir(filepath.Join(fms.root, string(algorithm)))
		if err != nil {
			if os.IsNotExist(err) {
				continue
			}
			return nil, nil, err
		}

		for _, fi := range fileInfos {
			// "mounts" is a sibling directory, not a layer digest.
			if fi.IsDir() && fi.Name() != "mounts" {
				dgst := digest.NewDigestFromEncoded(algorithm, fi.Name())
				if err := dgst.Validate(); err != nil {
					log.G(context.TODO()).Debugf("Ignoring invalid digest %s:%s", algorithm, fi.Name())
				} else {
					ids = append(ids, dgst)
				}
			}
		}
	}

	fileInfos, err := os.ReadDir(filepath.Join(fms.root, "mounts"))
	if err != nil {
		if os.IsNotExist(err) {
			// No mounts directory yet: report layers and no mounts.
			return ids, []string{}, nil
		}
		return nil, nil, err
	}

	var mounts []string
	for _, fi := range fileInfos {
		if fi.IsDir() {
			mounts = append(mounts, fi.Name())
		}
	}

	return ids, mounts, nil
}
// Remove deletes any layerdb folder for the given layer which was previously
// renamed to "<digest>-<random>-removing" (marked for removal), but only when
// its stored cache-id matches the requested cache value, so that an unrelated
// layer sharing the digest prefix is never removed.
func (fms *fileMetadataStore) Remove(layer ChainID, cache string) error {
	dgst := layer
	files, err := os.ReadDir(filepath.Join(fms.root, string(dgst.Algorithm())))
	if err != nil {
		return err
	}
	for _, f := range files {
		if !strings.HasSuffix(f.Name(), "-removing") || !strings.HasPrefix(f.Name(), dgst.Encoded()) {
			continue
		}

		// Make sure that we only remove layerdb folder which points to
		// requested cacheID
		dir := filepath.Join(fms.root, string(dgst.Algorithm()), f.Name())
		chainFile := filepath.Join(dir, "cache-id")
		contentBytes, err := os.ReadFile(chainFile)
		if err != nil {
			log.G(context.TODO()).WithError(err).WithField("file", chainFile).Error("cannot get cache ID")
			continue
		}
		cacheID := strings.TrimSpace(string(contentBytes))
		if cacheID != cache {
			continue
		}
		log.G(context.TODO()).Debugf("Removing folder: %s", dir)
		err = os.RemoveAll(dir)
		if err != nil && !os.IsNotExist(err) {
			// Best-effort: log and keep scanning remaining candidates.
			log.G(context.TODO()).WithError(err).WithField("name", f.Name()).Error("cannot remove layer")
			continue
		}
	}
	return nil
}

// RemoveMount deletes all metadata stored for the named mount.
func (fms *fileMetadataStore) RemoveMount(mount string) error {
	return os.RemoveAll(fms.getMountDirectory(mount))
}
// isValidID checks if mount/init id is valid. It is similar to
// regexp.MustCompile(`^[a-f0-9]{64}(-init)?$`).MatchString(id).
func isValidID(id string) bool {
	id = strings.TrimSuffix(id, "-init")
	if len(id) != 64 {
		return false
	}
	for i := 0; i < len(id); i++ {
		switch c := id[i]; {
		case '0' <= c && c <= '9':
		case 'a' <= c && c <= 'f':
		default:
			return false
		}
	}
	return true
}
// Package layer is package for managing read-only
// and read-write mounts on the union file system
// driver. Read-only mounts are referenced using a
// content hash and are protected from mutation in
// the exposed interface. The tar format is used
// to create read-only layers and export both
// read-only and writable layers. The exported
// tar data for a read-only layer should match
// the tar used to create the layer.
package layer
import (
"context"
"errors"
"io"
"github.com/containerd/log"
"github.com/docker/distribution"
"github.com/moby/go-archive"
"github.com/opencontainers/go-digest"
"github.com/opencontainers/image-spec/identity"
)
var (
	// ErrLayerDoesNotExist is used when an operation is
	// attempted on a layer which does not exist.
	ErrLayerDoesNotExist = errors.New("layer does not exist")

	// ErrLayerNotRetained is used when a release is
	// attempted on a layer which is not retained.
	ErrLayerNotRetained = errors.New("layer not retained")

	// ErrMountDoesNotExist is used when an operation is
	// attempted on a mount layer which does not exist.
	ErrMountDoesNotExist = errors.New("mount does not exist")

	// ErrMountNameConflict is used when a mount is attempted
	// to be created but there is already a mount with the name
	// used for creation.
	ErrMountNameConflict = errors.New("mount already exists with name")

	// ErrMaxDepthExceeded is used when a layer is attempted
	// to be created which would result in a layer depth
	// greater than the 125 max.
	ErrMaxDepthExceeded = errors.New("max depth exceeded")
)

// ChainID is the content-addressable ID of a layer. It is an alias of
// digest.Digest; ChainID and DiffID are distinguished only for readability.
type ChainID = digest.Digest

// DiffID is the hash of an individual layer tar.
type DiffID = digest.Digest
// TarStreamer represents an object which may
// have its contents exported as a tar stream.
type TarStreamer interface {
	// TarStream returns a tar archive stream
	// for the contents of a layer.
	TarStream() (io.ReadCloser, error)
}

// Layer represents a read-only layer
type Layer interface {
	TarStreamer

	// TarStreamFrom returns a tar archive stream for all the layer chain with
	// arbitrary depth.
	TarStreamFrom(ChainID) (io.ReadCloser, error)

	// ChainID returns the content hash of the entire layer chain. The hash
	// chain is made up of DiffID of top layer and all of its parents.
	ChainID() ChainID

	// DiffID returns the content hash of the layer
	// tar stream used to create this layer.
	DiffID() DiffID

	// Parent returns the next layer in the layer chain.
	Parent() Layer

	// Size returns the size of the entire layer chain. The size
	// is calculated from the total size of all files in the layers.
	Size() int64

	// DiffSize returns the size difference of the top layer
	// from parent layer.
	DiffSize() int64

	// Metadata returns the low level storage metadata associated
	// with layer.
	Metadata() (map[string]string, error)
}

// RWLayer represents a layer which is
// read and writable
type RWLayer interface {
	TarStreamer

	// Name of mounted layer
	Name() string

	// Parent returns the layer which the writable
	// layer was created from.
	Parent() Layer

	// Mount mounts the RWLayer and returns the filesystem path
	// to the writable layer.
	Mount(mountLabel string) (string, error)

	// Unmount unmounts the RWLayer. This should be called
	// for every mount. If there are multiple mount calls
	// this operation will only decrement the internal mount counter.
	Unmount() error

	// Size represents the size of the writable layer
	// as calculated by the total size of the files
	// changed in the mutable layer.
	Size() (int64, error)

	// Changes returns the set of changes for the mutable layer
	// from the base layer.
	Changes() ([]archive.Change, error)

	// Metadata returns the low level metadata for the mutable layer
	Metadata() (map[string]string, error)

	// ApplyDiff applies the diff to the RW layer
	ApplyDiff(diff io.Reader) (int64, error)
}
// Metadata holds information about a
// read-only layer
type Metadata struct {
	// ChainID is the content hash of the layer
	ChainID ChainID

	// DiffID is the hash of the tar data used to
	// create the layer
	DiffID DiffID

	// Size is the size of the layer and all parents
	Size int64

	// DiffSize is the size of the top layer
	DiffSize int64
}

// MountInit is a function to initialize a
// writable mount. Changes made here will
// not be included in the Tar stream of the
// RWLayer.
type MountInit func(root string) error

// CreateRWLayerOpts contains optional arguments to be passed to CreateRWLayer
type CreateRWLayerOpts struct {
	MountLabel string
	InitFunc   MountInit
	StorageOpt map[string]string
}

// Store represents a backend for managing both
// read-only and read-write layers.
type Store interface {
	// Register imports a tar stream as a new layer on top of parent.
	Register(io.Reader, ChainID) (Layer, error)
	// Get retrieves a retained reference to the layer with the given ID.
	Get(ChainID) (Layer, error)
	// Map returns a snapshot of all known layers keyed by chain ID.
	Map() map[ChainID]Layer
	// Release drops a reference to the layer, returning metadata for any
	// layers removed as a result.
	Release(Layer) ([]Metadata, error)

	// CreateRWLayer creates a named writable layer on top of parent.
	CreateRWLayer(id string, parent ChainID, opts *CreateRWLayerOpts) (RWLayer, error)
	// GetRWLayer retrieves a retained reference to the named writable layer.
	GetRWLayer(id string) (RWLayer, error)
	// GetMountID returns the graph-driver mount ID for the named layer.
	GetMountID(id string) (string, error)
	// ReleaseRWLayer drops a reference to the writable layer, returning
	// metadata for any layers removed as a result.
	ReleaseRWLayer(RWLayer) ([]Metadata, error)

	// Cleanup releases store resources.
	Cleanup() error
	// DriverStatus reports graph-driver status as key/value pairs.
	DriverStatus() [][2]string
	// DriverName returns the name of the backing graph driver.
	DriverName() string
}

// DescribableStore represents a layer store capable of storing
// descriptors for layers.
type DescribableStore interface {
	RegisterWithDescriptor(io.Reader, ChainID, distribution.Descriptor) (Layer, error)
}
// CreateChainID returns ID for a layerDigest slice.
//
// Deprecated: use [identity.ChainID].
func CreateChainID(dgsts []DiffID) ChainID {
	// Thin wrapper kept only for backward compatibility.
	return identity.ChainID(dgsts)
}
// ReleaseAndLog releases the provided layer from the given layer
// store, logging any error and release metadata
func ReleaseAndLog(ls Store, l Layer) {
	ctx := context.TODO()
	metadata, err := ls.Release(l)
	if err != nil {
		log.G(ctx).Errorf("Error releasing layer %s: %v", l.ChainID(), err)
	}
	for _, m := range metadata {
		log.G(ctx).WithField("chainID", m.ChainID).Infof("Cleaned up layer %s", m.ChainID)
	}
}
package layer
import (
"context"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"sync"
"github.com/containerd/log"
"github.com/docker/distribution"
"github.com/docker/docker/daemon/graphdriver"
"github.com/docker/docker/pkg/stringid"
"github.com/moby/locker"
"github.com/moby/sys/user"
"github.com/opencontainers/go-digest"
"github.com/opencontainers/image-spec/identity"
"github.com/vbatts/tar-split/tar/asm"
"github.com/vbatts/tar-split/tar/storage"
)
// maxLayerDepth represents the maximum number of
// layers which can be chained together. 125 was
// chosen to account for the 127 max in some
// graphdrivers plus the 2 additional layers
// used to create a rwlayer.
const maxLayerDepth = 125

// layerStore implements Store on top of a graph driver and a file-backed
// metadata store.
type layerStore struct {
	store  *fileMetadataStore
	driver graphdriver.Driver

	// layerMap holds all loaded read-only layers, guarded by layerL.
	layerMap map[ChainID]*roLayer
	layerL   sync.Mutex

	// mounts holds all loaded writable layers, guarded by mountL.
	mounts map[string]*mountedLayer
	mountL sync.Mutex

	// protect *RWLayer() methods from operating on the same name/id
	locker *locker.Locker
}

// StoreOptions are the options used to create a new Store instance
type StoreOptions struct {
	Root               string
	GraphDriver        string
	GraphDriverOptions []string
	IDMapping          user.IdentityMapping
}
// NewStoreFromOptions creates a new Store instance. The graph driver is
// initialized from the options, and the layer metadata database lives under
// <root>/image/<driver>/layerdb.
func NewStoreFromOptions(options StoreOptions) (Store, error) {
	driver, err := graphdriver.New(options.GraphDriver, graphdriver.Options{
		Root:          options.Root,
		DriverOptions: options.GraphDriverOptions,
		IDMap:         options.IDMapping,
	})
	if err != nil {
		// Wrap with %w so callers can unwrap the underlying driver error.
		if options.GraphDriver != "" {
			return nil, fmt.Errorf("error initializing graphdriver: %w: %s", err, options.GraphDriver)
		}
		return nil, fmt.Errorf("error initializing graphdriver: %w", err)
	}
	log.G(context.TODO()).Debugf("Initialized graph driver %s", driver)

	driverName := driver.String()
	layerDBRoot := filepath.Join(options.Root, "image", driverName, "layerdb")

	return newStoreFromGraphDriver(layerDBRoot, driver)
}
// newStoreFromGraphDriver creates a new Store instance using the provided
// metadata store and graph driver. The metadata store will be used to restore
// the Store.
func newStoreFromGraphDriver(root string, driver graphdriver.Driver) (Store, error) {
	ms, err := newFSMetadataStore(root)
	if err != nil {
		return nil, err
	}
	ls := &layerStore{
		store:    ms,
		driver:   driver,
		layerMap: map[ChainID]*roLayer{},
		mounts:   map[string]*mountedLayer{},
		locker:   locker.New(),
	}

	ids, mounts, err := ms.List()
	if err != nil {
		return nil, err
	}

	for _, id := range ids {
		l, err := ls.loadLayer(id)
		if err != nil {
			// Skip layers with unreadable or incomplete metadata;
			// the remaining layers are still restored.
			log.G(context.TODO()).Debugf("Failed to load layer %s: %s", id, err)
			continue
		}
		if l.parent != nil {
			l.parent.referenceCount++
		}
	}

	for _, mount := range mounts {
		// Failure to load a mount is logged but does not abort startup.
		if err := ls.loadMount(mount); err != nil {
			log.G(context.TODO()).Debugf("Failed to load mount %s: %s", mount, err)
		}
	}

	return ls, nil
}

// Driver returns the graph driver backing this layer store.
func (ls *layerStore) Driver() graphdriver.Driver {
	return ls.driver
}
// loadLayer reads the persisted metadata for the given chain ID into a
// roLayer, recursively loading its parent chain, and caches the result in
// ls.layerMap. Already-loaded layers are returned from the cache.
//
// NOTE(review): layerMap is accessed without holding layerL here; callers
// appear to invoke this during store initialization — confirm before calling
// from concurrent paths.
func (ls *layerStore) loadLayer(layer ChainID) (*roLayer, error) {
	cl, ok := ls.layerMap[layer]
	if ok {
		return cl, nil
	}

	diff, err := ls.store.GetDiffID(layer)
	if err != nil {
		return nil, fmt.Errorf("failed to get diff id for %s: %s", layer, err)
	}

	size, err := ls.store.GetSize(layer)
	if err != nil {
		return nil, fmt.Errorf("failed to get size for %s: %s", layer, err)
	}

	cacheID, err := ls.store.GetCacheID(layer)
	if err != nil {
		return nil, fmt.Errorf("failed to get cache id for %s: %s", layer, err)
	}

	parent, err := ls.store.GetParent(layer)
	if err != nil {
		return nil, fmt.Errorf("failed to get parent for %s: %s", layer, err)
	}

	descriptor, err := ls.store.GetDescriptor(layer)
	if err != nil {
		return nil, fmt.Errorf("failed to get descriptor for %s: %s", layer, err)
	}

	cl = &roLayer{
		chainID:    layer,
		diffID:     diff,
		size:       size,
		cacheID:    cacheID,
		layerStore: ls,
		references: map[Layer]struct{}{},
		descriptor: descriptor,
	}

	// An empty parent means this is a base layer.
	if parent != "" {
		p, err := ls.loadLayer(parent)
		if err != nil {
			return nil, err
		}
		cl.parent = p
	}

	ls.layerMap[cl.chainID] = cl

	return cl, nil
}
// loadMount reads the persisted metadata for the named mount into a
// mountedLayer and registers it in ls.mounts, loading (and reference
// counting) its parent layer chain if present. Loading an already-known
// mount is a no-op.
func (ls *layerStore) loadMount(mount string) error {
	ls.mountL.Lock()
	defer ls.mountL.Unlock()
	if _, ok := ls.mounts[mount]; ok {
		return nil
	}

	mountID, err := ls.store.GetMountID(mount)
	if err != nil {
		return err
	}

	initID, err := ls.store.GetInitID(mount)
	if err != nil {
		return err
	}

	parent, err := ls.store.GetMountParent(mount)
	if err != nil {
		return err
	}

	ml := &mountedLayer{
		name:       mount,
		mountID:    mountID,
		initID:     initID,
		layerStore: ls,
		references: map[RWLayer]*referencedRWLayer{},
	}

	if parent != "" {
		p, err := ls.loadLayer(parent)
		if err != nil {
			return err
		}
		ml.parent = p

		// The mount holds a reference on its parent chain.
		p.referenceCount++
	}

	ls.mounts[ml.name] = ml

	return nil
}
// applyTar extracts the tar stream ts into the layer's graph-driver storage
// while simultaneously recording tar-split metadata (for later faithful
// re-export) and computing the canonical digest of the stream. On success it
// fills in layer.size and layer.diffID.
func (ls *layerStore) applyTar(tx *fileMetadataTransaction, ts io.Reader, parent string, layer *roLayer) error {
	tsw, err := tx.TarSplitWriter(true)
	if err != nil {
		return err
	}
	metaPacker := storage.NewJSONPacker(tsw)
	defer tsw.Close()

	// The digest is computed over the raw tar bytes as they are read.
	digester := digest.Canonical.Digester()
	tr := io.TeeReader(ts, digester.Hash())

	// we're passing nil here for the file putter, because the ApplyDiff will
	// handle the extraction of the archive
	rdr, err := asm.NewInputTarStream(tr, metaPacker, nil)
	if err != nil {
		return err
	}

	applySize, err := ls.driver.ApplyDiff(layer.cacheID, parent, rdr)

	// discard trailing data but ensure metadata is picked up to reconstruct stream
	// unconditionally call io.Copy here before checking err to ensure the resources
	// allocated by NewInputTarStream above are always released
	io.Copy(io.Discard, rdr) // ignore error as reader may be closed

	if err != nil {
		return err
	}

	layer.size = applySize
	layer.diffID = digester.Digest()

	log.G(context.TODO()).Debugf("Applied tar %s to %s, size: %d", layer.diffID, layer.cacheID, applySize)

	return nil
}
// Register imports the tar stream ts as a new layer on top of parent with an
// empty distribution descriptor.
func (ls *layerStore) Register(ts io.Reader, parent ChainID) (Layer, error) {
	return ls.registerWithDescriptor(ts, parent, distribution.Descriptor{})
}

// registerWithDescriptor imports ts as a new layer on top of parent, storing
// the given descriptor alongside it. If a layer with the resulting chain ID
// already exists, the freshly created driver data and metadata transaction
// are cleaned up and a reference to the existing layer is returned.
func (ls *layerStore) registerWithDescriptor(ts io.Reader, parent ChainID, descriptor distribution.Descriptor) (Layer, error) {
	// cErr is used to hold the error which will always trigger
	// cleanup of creates sources but may not be an error returned
	// to the caller (already exists).
	var cErr error
	var pid string
	var p *roLayer

	if string(parent) != "" {
		ls.layerL.Lock()
		p = ls.get(parent)
		ls.layerL.Unlock()
		if p == nil {
			return nil, ErrLayerDoesNotExist
		}
		pid = p.cacheID
		// Release parent chain if error
		defer func() {
			if cErr != nil {
				ls.layerL.Lock()
				ls.releaseLayer(p)
				ls.layerL.Unlock()
			}
		}()
		if p.depth() >= maxLayerDepth {
			cErr = ErrMaxDepthExceeded
			return nil, cErr
		}
	}

	// Create new roLayer
	layer := &roLayer{
		parent:         p,
		cacheID:        stringid.GenerateRandomID(),
		referenceCount: 1,
		layerStore:     ls,
		references:     map[Layer]struct{}{},
		descriptor:     descriptor,
	}

	// Create the driver-side storage first; metadata references it.
	if cErr = ls.driver.Create(layer.cacheID, pid, nil); cErr != nil {
		return nil, cErr
	}

	tx, cErr := ls.store.StartTransaction()
	if cErr != nil {
		return nil, cErr
	}

	// Any cErr set below (including the "already exists" marker) rolls
	// back both the driver data and the metadata transaction.
	defer func() {
		if cErr != nil {
			log.G(context.TODO()).WithFields(log.Fields{"cache-id": layer.cacheID, "error": cErr}).Debug("Cleaning up cache layer after error")
			if err := ls.driver.Remove(layer.cacheID); err != nil {
				log.G(context.TODO()).WithFields(log.Fields{"cache-id": layer.cacheID, "error": err}).Error("Error cleaning up cache layer after error")
			}
			if err := tx.Cancel(); err != nil {
				log.G(context.TODO()).WithFields(log.Fields{"cache-id": layer.cacheID, "error": err, "tx": tx.String()}).Error("Error canceling metadata transaction")
			}
		}
	}()

	if cErr = ls.applyTar(tx, ts, pid, layer); cErr != nil {
		return nil, cErr
	}

	if layer.parent == nil {
		layer.chainID = layer.diffID
	} else {
		layer.chainID = identity.ChainID([]digest.Digest{layer.parent.chainID, layer.diffID})
	}

	if cErr = storeLayer(tx, layer); cErr != nil {
		return nil, cErr
	}

	ls.layerL.Lock()
	defer ls.layerL.Unlock()

	if existingLayer := ls.get(layer.chainID); existingLayer != nil {
		// Set error for cleanup, but do not return the error
		cErr = errors.New("layer already exists")
		return existingLayer.getReference(), nil
	}

	if cErr = tx.Commit(layer.chainID); cErr != nil {
		return nil, cErr
	}

	ls.layerMap[layer.chainID] = layer

	return layer.getReference(), nil
}
// get returns the layer with the given chain ID and increments its reference
// count, or nil when the layer is not loaded. The caller must hold ls.layerL.
func (ls *layerStore) get(layer ChainID) *roLayer {
	l, ok := ls.layerMap[layer]
	if !ok {
		return nil
	}
	l.referenceCount++
	return l
}

// Get returns a retained reference to the layer with the given chain ID, or
// ErrLayerDoesNotExist if it is unknown.
func (ls *layerStore) Get(l ChainID) (Layer, error) {
	ls.layerL.Lock()
	defer ls.layerL.Unlock()

	layer := ls.get(l)
	if layer == nil {
		return nil, ErrLayerDoesNotExist
	}

	return layer.getReference(), nil
}

// Map returns a snapshot copy of all currently loaded layers keyed by chain
// ID. The returned map is safe for the caller to mutate.
func (ls *layerStore) Map() map[ChainID]Layer {
	ls.layerL.Lock()
	defer ls.layerL.Unlock()

	layers := map[ChainID]Layer{}

	for k, v := range ls.layerMap {
		layers[k] = v
	}

	return layers
}
// deleteLayer removes the layer's graph-driver data and metadata directory,
// filling metadata with details of the removed layer. The metadata directory
// is first renamed to "<digest>-<random>-removing" so that, should the driver
// removal fail midway, getOrphan can later find and retry the cleanup.
func (ls *layerStore) deleteLayer(layer *roLayer, metadata *Metadata) error {
	// Rename layer digest folder first so we detect orphan layer(s)
	// if ls.driver.Remove fails
	var dir string
	for {
		// Generate a new random suffix until the rename does not collide.
		// NOTE(review): rename errors other than "exists" also break out
		// of the loop and are not reported — confirm this is intended.
		tmpID := fmt.Sprintf("%s-%s-removing", layer.chainID.Encoded(), stringid.GenerateRandomID())
		dir = filepath.Join(ls.store.root, string(layer.chainID.Algorithm()), tmpID)
		err := os.Rename(ls.store.getLayerDirectory(layer.chainID), dir)
		if os.IsExist(err) {
			continue
		}
		break
	}

	err := ls.driver.Remove(layer.cacheID)
	if err != nil {
		return err
	}

	err = os.RemoveAll(dir)
	if err != nil {
		return err
	}

	metadata.DiffID = layer.diffID
	metadata.ChainID = layer.chainID
	metadata.Size = layer.Size()
	metadata.DiffSize = layer.size

	return nil
}
// releaseLayer decrements the reference count of l and, when it reaches zero,
// deletes the layer and walks up the parent chain repeating the process.
// It returns metadata for every layer actually removed. The caller must hold
// ls.layerL. The panics guard internal invariants (refcounting bugs), not
// user input.
func (ls *layerStore) releaseLayer(l *roLayer) ([]Metadata, error) {
	depth := 0
	removed := []Metadata{}
	for {
		if l.referenceCount == 0 {
			panic("layer not retained")
		}
		l.referenceCount--
		if l.referenceCount != 0 {
			return removed, nil
		}

		// A parent can only be removed if its child was removed in this
		// same walk; otherwise the chain would be left dangling.
		if len(removed) == 0 && depth > 0 {
			panic("cannot remove layer with child")
		}
		if l.hasReferences() {
			panic("cannot delete referenced layer")
		}

		// Remove layer from layer map first so it is not considered to exist
		// when if ls.deleteLayer fails.
		delete(ls.layerMap, l.chainID)

		var metadata Metadata
		if err := ls.deleteLayer(l, &metadata); err != nil {
			return nil, err
		}

		removed = append(removed, metadata)

		if l.parent == nil {
			return removed, nil
		}

		depth++
		l = l.parent
	}
}
// Release drops the given reference to the layer, removing the layer (and
// any exclusively-held parents) when the last reference is gone. Releasing
// an unknown layer is a no-op; releasing a reference the layer does not hold
// returns ErrLayerNotRetained.
func (ls *layerStore) Release(l Layer) ([]Metadata, error) {
	ls.layerL.Lock()
	defer ls.layerL.Unlock()
	layer, ok := ls.layerMap[l.ChainID()]
	if !ok {
		return []Metadata{}, nil
	}
	if !layer.hasReference(l) {
		return nil, ErrLayerNotRetained
	}

	layer.deleteReference(l)

	return ls.releaseLayer(layer)
}
// CreateRWLayer creates a new named writable layer on top of parent. opts may
// be nil; when provided, MountLabel, StorageOpt and InitFunc customize the
// mount. When InitFunc is set, an "-init" layer is created between the parent
// and the writable layer (see initMount). The name is locked for the duration
// of the call, and a retained reference to the new layer is returned.
func (ls *layerStore) CreateRWLayer(name string, parent ChainID, opts *CreateRWLayerOpts) (_ RWLayer, retErr error) {
	var (
		storageOpt map[string]string
		initFunc   MountInit
		mountLabel string
	)

	if opts != nil {
		mountLabel = opts.MountLabel
		storageOpt = opts.StorageOpt
		initFunc = opts.InitFunc
	}

	ls.locker.Lock(name)
	defer ls.locker.Unlock(name)

	ls.mountL.Lock()
	_, ok := ls.mounts[name]
	ls.mountL.Unlock()
	if ok {
		return nil, ErrMountNameConflict
	}

	var parentID string
	var p *roLayer

	if string(parent) != "" {
		ls.layerL.Lock()
		p = ls.get(parent)
		ls.layerL.Unlock()
		if p == nil {
			return nil, ErrLayerDoesNotExist
		}
		parentID = p.cacheID

		// Release parent chain if error
		defer func() {
			if retErr != nil {
				ls.layerL.Lock()
				_, _ = ls.releaseLayer(p)
				ls.layerL.Unlock()
			}
		}()
	}

	m := &mountedLayer{
		name:       name,
		parent:     p,
		mountID:    ls.mountID(name),
		layerStore: ls,
		references: map[RWLayer]*referencedRWLayer{},
	}

	if initFunc != nil {
		// The init layer becomes the effective parent of the RW layer.
		var err error
		parentID, err = ls.initMount(m.mountID, parentID, mountLabel, initFunc, storageOpt)
		if err != nil {
			return nil, err
		}
		m.initID = parentID
	}

	createOpts := &graphdriver.CreateOpts{
		StorageOpt: storageOpt,
	}

	if err := ls.driver.CreateReadWrite(m.mountID, parentID, createOpts); err != nil {
		return nil, err
	}
	if err := ls.saveMount(m); err != nil {
		return nil, err
	}

	return m.getReference(), nil
}
// GetRWLayer returns a retained reference to the named writable layer, or
// ErrMountDoesNotExist when no such mount is loaded.
func (ls *layerStore) GetRWLayer(id string) (RWLayer, error) {
	ls.locker.Lock(id)
	defer ls.locker.Unlock(id)

	ls.mountL.Lock()
	mount := ls.mounts[id]
	ls.mountL.Unlock()
	if mount == nil {
		return nil, ErrMountDoesNotExist
	}

	return mount.getReference(), nil
}

// GetMountID returns the graph-driver mount ID recorded for the named
// writable layer without taking a reference on it.
func (ls *layerStore) GetMountID(id string) (string, error) {
	ls.mountL.Lock()
	mount := ls.mounts[id]
	ls.mountL.Unlock()
	if mount == nil {
		return "", ErrMountDoesNotExist
	}
	log.G(context.TODO()).Debugf("GetMountID id: %s -> mountID: %s", id, mount.mountID)

	return mount.mountID, nil
}
// ReleaseRWLayer drops reference l on its RW layer. When the last
// reference is gone, the layer's storage, its optional init layer, and the
// mount metadata are removed, and the parent chain (if any) is released.
// The returned Metadata lists read-only layers removed as a consequence.
func (ls *layerStore) ReleaseRWLayer(l RWLayer) ([]Metadata, error) {
name := l.Name()
ls.locker.Lock(name)
defer ls.locker.Unlock(name)
ls.mountL.Lock()
m := ls.mounts[name]
ls.mountL.Unlock()
if m == nil {
// Unknown mount: nothing to release.
return []Metadata{}, nil
}
if err := m.deleteReference(l); err != nil {
return nil, err
}
if m.hasReferences() {
// Other references are still outstanding; keep the layer alive.
return []Metadata{}, nil
}
// From here on, each failure re-adds the reference so the caller can
// retry the release later.
if err := ls.driver.Remove(m.mountID); err != nil {
log.G(context.TODO()).Errorf("Error removing mounted layer %s: %s", m.name, err)
m.retakeReference(l)
return nil, err
}
if m.initID != "" {
if err := ls.driver.Remove(m.initID); err != nil {
log.G(context.TODO()).Errorf("Error removing init layer %s: %s", m.name, err)
m.retakeReference(l)
return nil, err
}
}
if err := ls.store.RemoveMount(m.name); err != nil {
log.G(context.TODO()).Errorf("Error removing mount metadata: %s: %s", m.name, err)
m.retakeReference(l)
return nil, err
}
ls.mountL.Lock()
delete(ls.mounts, name)
ls.mountL.Unlock()
// Releasing the parent may cascade into removing read-only layers.
ls.layerL.Lock()
defer ls.layerL.Unlock()
if m.parent != nil {
return ls.releaseLayer(m.parent)
}
return []Metadata{}, nil
}
// saveMount persists the mount's metadata (mount ID, optional init ID,
// optional parent chain ID) and then registers it in the in-memory map.
func (ls *layerStore) saveMount(m *mountedLayer) error {
	if err := ls.store.SetMountID(m.name, m.mountID); err != nil {
		return err
	}
	if m.initID != "" {
		if err := ls.store.SetInitID(m.name, m.initID); err != nil {
			return err
		}
	}
	if m.parent != nil {
		if err := ls.store.SetMountParent(m.name, m.parent.chainID); err != nil {
			return err
		}
	}

	ls.mountL.Lock()
	defer ls.mountL.Unlock()
	ls.mounts[m.name] = m
	return nil
}
// initMount creates and populates the init layer for a new RW layer by
// mounting it and invoking initFunc on the mounted path. It returns the
// init layer's ID, which becomes the effective parent of the RW layer.
func (ls *layerStore) initMount(graphID, parent, mountLabel string, initFunc MountInit, storageOpt map[string]string) (string, error) {
// Use "<graph-id>-init" to maintain compatibility with graph drivers
// which are expecting this layer with this special name. If all
// graph drivers can be updated to not rely on knowing about this layer
// then the initID should be randomly generated.
initID := fmt.Sprintf("%s-init", graphID)
createOpts := &graphdriver.CreateOpts{
MountLabel: mountLabel,
StorageOpt: storageOpt,
}
if err := ls.driver.CreateReadWrite(initID, parent, createOpts); err != nil {
return "", err
}
// Mount the init layer so initFunc can write into it.
p, err := ls.driver.Get(initID, "")
if err != nil {
return "", err
}
if err := initFunc(p); err != nil {
// Best-effort unmount; the initFunc error takes precedence.
ls.driver.Put(initID)
return "", err
}
if err := ls.driver.Put(initID); err != nil {
return "", err
}
return initID, nil
}
// getTarStream reassembles the layer's original tar from its tar-split
// metadata plus the on-disk files, streaming the result through a pipe.
func (ls *layerStore) getTarStream(rl *roLayer) (io.ReadCloser, error) {
	metadata, err := ls.store.TarSplitReader(rl.chainID)
	if err != nil {
		return nil, err
	}

	pr, pw := io.Pipe()
	go func() {
		// CloseWithError(nil) is equivalent to Close, so a single call
		// covers both the success and the failure path.
		_ = pw.CloseWithError(ls.assembleTarTo(rl.cacheID, metadata, nil, pw))
	}()
	return pr, nil
}
// assembleTarTo writes the reassembled tar stream for graphID to w, using
// the tar-split metadata stream and the driver's file getter. When size is
// non-nil it accumulates the total size of the unpacked entries.
func (ls *layerStore) assembleTarTo(graphID string, metadata io.ReadCloser, size *int64, w io.Writer) error {
	// Fall back to a path-based getter for drivers without native
	// DiffGetter support.
	diffDriver, ok := ls.driver.(graphdriver.DiffGetterDriver)
	if !ok {
		diffDriver = &naiveDiffPathDriver{ls.driver}
	}
	defer metadata.Close()
	// get our relative path to the container
	fileGetCloser, err := diffDriver.DiffGetter(graphID)
	if err != nil {
		return err
	}
	defer fileGetCloser.Close()
	metaUnpacker := storage.NewJSONUnpacker(metadata)
	// Renamed from the misspelled "upackerCounter".
	unpackerCounter := &unpackSizeCounter{metaUnpacker, size}
	log.G(context.TODO()).Debugf("Assembling tar data for %s", graphID)
	return asm.WriteOutputTarStream(fileGetCloser, unpackerCounter, w)
}
// Cleanup removes orphaned layers (driver storage left behind without
// metadata references) and then delegates to the graph driver's own
// Cleanup. Failures on individual orphans are logged, not fatal.
func (ls *layerStore) Cleanup() error {
orphanLayers, err := ls.store.getOrphan()
if err != nil {
log.G(context.TODO()).WithError(err).Error("cannot get orphan layers")
}
if len(orphanLayers) > 0 {
log.G(context.TODO()).Debugf("found %v orphan layers", len(orphanLayers))
}
for _, orphan := range orphanLayers {
log.G(context.TODO()).WithField("cache-id", orphan.cacheID).Debugf("removing orphan layer, chain ID: %v", orphan.chainID)
err = ls.driver.Remove(orphan.cacheID)
if err != nil && !os.IsNotExist(err) {
// Keep the metadata so removal can be retried on a later Cleanup.
log.G(context.TODO()).WithError(err).WithField("cache-id", orphan.cacheID).Error("cannot remove orphan layer")
continue
}
err = ls.store.Remove(orphan.chainID, orphan.cacheID)
if err != nil {
log.G(context.TODO()).WithError(err).WithField("chain-id", orphan.chainID).Error("cannot remove orphan layer metadata")
}
}
return ls.driver.Cleanup()
}
// DriverStatus returns the graph driver's status as key/value pairs.
func (ls *layerStore) DriverStatus() [][2]string {
return ls.driver.Status()
}
// DriverName returns the name of the underlying graph driver.
func (ls *layerStore) DriverName() string {
return ls.driver.String()
}
// naiveDiffPathDriver adapts a plain graphdriver.Driver into a
// DiffGetterDriver by mounting the layer and serving files by path.
type naiveDiffPathDriver struct {
graphdriver.Driver
}
// fileGetPutter pairs a FileGetter with the driver mount backing it, so
// closing the getter releases (Put) the mount.
type fileGetPutter struct {
storage.FileGetter
driver graphdriver.Driver
id string
}
// Close unmounts the layer the getter was reading from.
func (w *fileGetPutter) Close() error {
return w.driver.Put(w.id)
}
// DiffGetter mounts layer id and exposes its filesystem through a
// path-based FileGetter; Close on the result unmounts it again.
func (n *naiveDiffPathDriver) DiffGetter(id string) (graphdriver.FileGetCloser, error) {
	dir, err := n.Driver.Get(id, "")
	if err != nil {
		return nil, err
	}
	getter := &fileGetPutter{
		FileGetter: storage.NewPathFileGetter(dir),
		driver:     n.Driver,
		id:         id,
	}
	return getter, nil
}
//go:build linux || freebsd || darwin || openbsd
package layer
import "github.com/docker/docker/pkg/stringid"
// mountID generates a random ID for a new mount. The name argument is
// intentionally unused on these platforms; the ID is purely random.
func (ls *layerStore) mountID(name string) string {
return stringid.GenerateRandomID()
}
package layer
import (
"compress/gzip"
"context"
"errors"
"io"
"os"
"github.com/containerd/log"
"github.com/opencontainers/go-digest"
"github.com/opencontainers/image-spec/identity"
"github.com/vbatts/tar-split/tar/asm"
"github.com/vbatts/tar-split/tar/storage"
)
// ChecksumForGraphID computes the DiffID digest and packed size of the
// driver diff between id and parent. As a side effect it writes the
// tar-split metadata (gzip-compressed JSON) to newTarDataPath so the tar
// can later be reassembled byte-for-byte.
func (ls *layerStore) ChecksumForGraphID(id, parent, newTarDataPath string) (diffID DiffID, size int64, _ error) {
rawArchive, err := ls.driver.Diff(id, parent)
if err != nil {
return "", 0, err
}
defer rawArchive.Close()
f, err := os.Create(newTarDataPath)
if err != nil {
return "", 0, err
}
defer f.Close()
mfz := gzip.NewWriter(f)
defer mfz.Close()
metaPacker := storage.NewJSONPacker(mfz)
// packerCounter accumulates the entry sizes into size as it packs.
packerCounter := &packSizeCounter{metaPacker, &size}
// Tee the tar through the packer while computing its digest; digest
// consumption drives the packing to completion.
archive, err := asm.NewInputTarStream(rawArchive, packerCounter, nil)
if err != nil {
return "", 0, err
}
dgst, err := digest.FromReader(archive)
if err != nil {
return "", 0, err
}
return dgst, size, nil
}
// RegisterByGraphID registers an existing graph-driver layer (identified by
// graphID) as a read-only layer with the given parent, diff ID, and
// pre-computed tar-split data file. It is used when migrating layers that
// were created outside the layer store.
func (ls *layerStore) RegisterByGraphID(graphID string, parent ChainID, diffID DiffID, tarDataFile string, size int64) (Layer, error) {
// cleanupErr is used to hold the error which will always trigger
// cleanup of creates sources but may not be an error returned
// to the caller (already exists).
var cleanupErr error
var p *roLayer
if string(parent) != "" {
ls.layerL.Lock()
p = ls.get(parent)
ls.layerL.Unlock()
if p == nil {
return nil, ErrLayerDoesNotExist
}
// Release parent chain if error
defer func() {
if cleanupErr != nil {
ls.layerL.Lock()
_, _ = ls.releaseLayer(p)
ls.layerL.Unlock()
}
}()
}
// The chain ID is derived from the parent chain ID plus this layer's
// diff ID.
var diffIDs []digest.Digest
if parent != "" {
diffIDs = append(diffIDs, parent)
}
diffIDs = append(diffIDs, diffID)
// Create new roLayer
layer := &roLayer{
parent: p,
cacheID: graphID,
referenceCount: 1,
layerStore: ls,
references: map[Layer]struct{}{},
diffID: diffID,
size: size,
chainID: identity.ChainID(diffIDs),
}
ls.layerL.Lock()
defer ls.layerL.Unlock()
if existingLayer := ls.get(layer.chainID); existingLayer != nil {
// Set error for cleanup, but do not return
cleanupErr = errors.New("layer already exists")
return existingLayer.getReference(), nil
}
tx, cleanupErr := ls.store.StartTransaction()
if cleanupErr != nil {
return nil, cleanupErr
}
// Any later failure sets cleanupErr, which cancels the transaction here.
defer func() {
if cleanupErr != nil {
log.G(context.TODO()).Debugf("Cleaning up transaction after failed migration for %s: %v", graphID, cleanupErr)
if err := tx.Cancel(); err != nil {
log.G(context.TODO()).Errorf("Error canceling metadata transaction %q: %s", tx.String(), err)
}
}
}()
// Copy the pre-computed tar-split data into the transaction.
tsw, cleanupErr := tx.TarSplitWriter(false)
if cleanupErr != nil {
return nil, cleanupErr
}
defer tsw.Close()
tdf, cleanupErr := os.Open(tarDataFile)
if cleanupErr != nil {
return nil, cleanupErr
}
defer tdf.Close()
_, cleanupErr = io.Copy(tsw, tdf)
if cleanupErr != nil {
return nil, cleanupErr
}
if cleanupErr = storeLayer(tx, layer); cleanupErr != nil {
return nil, cleanupErr
}
if cleanupErr = tx.Commit(layer.chainID); cleanupErr != nil {
return nil, cleanupErr
}
ls.layerMap[layer.chainID] = layer
return layer.getReference(), nil
}
// unpackSizeCounter wraps a storage.Unpacker and, when size is non-nil,
// accumulates the size of every entry read into *size.
type unpackSizeCounter struct {
	unpacker storage.Unpacker
	size     *int64
}

// Next returns the next entry from the wrapped unpacker, adding its size
// to the running total on success.
func (u *unpackSizeCounter) Next() (*storage.Entry, error) {
	entry, err := u.unpacker.Next()
	if err != nil || u.size == nil {
		return entry, err
	}
	*u.size += entry.Size
	return entry, nil
}
// packSizeCounter wraps a storage.Packer and, when size is non-nil,
// accumulates the size of every entry written into *size.
type packSizeCounter struct {
	packer storage.Packer
	size   *int64
}

// AddEntry forwards e to the wrapped packer, adding its size to the
// running total on success.
func (p *packSizeCounter) AddEntry(e storage.Entry) (int, error) {
	n, err := p.packer.AddEntry(e)
	if err != nil || p.size == nil {
		return n, err
	}
	*p.size += e.Size
	return n, nil
}
package layer
import (
"io"
"sync"
"github.com/moby/go-archive"
)
// mountedLayer is the in-memory representation of a read-write layer and
// the references handed out for it. The embedded mutex guards references.
type mountedLayer struct {
// name is the caller-visible mount name.
name string
// mountID is the graph-driver ID for the RW layer.
mountID string
// initID is the graph-driver ID of the init layer, if one was created.
initID string
// parent is the read-only layer this RW layer sits on, or nil.
parent *roLayer
layerStore *layerStore
sync.Mutex
references map[RWLayer]*referencedRWLayer
}
// cacheParent returns the cache ID the driver should treat as this
// layer's parent: the init layer when present, otherwise the parent
// read-only layer's cache ID, otherwise the empty string.
func (ml *mountedLayer) cacheParent() string {
	switch {
	case ml.initID != "":
		return ml.initID
	case ml.parent != nil:
		return ml.parent.cacheID
	default:
		return ""
	}
}
// TarStream returns the driver diff of this RW layer against its
// effective parent (init layer or read-only parent) as a tar stream.
func (ml *mountedLayer) TarStream() (io.ReadCloser, error) {
return ml.layerStore.driver.Diff(ml.mountID, ml.cacheParent())
}
// Name returns the caller-visible name of the mount.
func (ml *mountedLayer) Name() string {
return ml.name
}
// Parent returns the read-only layer beneath this RW layer. It returns a
// literal nil interface when there is no parent, rather than an interface
// wrapping a nil *roLayer (which would compare non-nil).
func (ml *mountedLayer) Parent() Layer {
	if p := ml.parent; p != nil {
		return p
	}
	return nil
}
// Size returns the size of this RW layer's diff against its effective
// parent, as reported by the graph driver.
func (ml *mountedLayer) Size() (int64, error) {
return ml.layerStore.driver.DiffSize(ml.mountID, ml.cacheParent())
}
// Changes returns the filesystem changes of this RW layer relative to its
// effective parent, as reported by the graph driver.
func (ml *mountedLayer) Changes() ([]archive.Change, error) {
return ml.layerStore.driver.Changes(ml.mountID, ml.cacheParent())
}
// Metadata returns driver metadata for this mount, guaranteeing a non-nil
// map and an "ID" entry (defaulting to the mount name).
func (ml *mountedLayer) Metadata() (map[string]string, error) {
	meta, err := ml.layerStore.driver.GetMetadata(ml.mountID)
	if err != nil {
		return nil, err
	}
	if meta == nil {
		meta = map[string]string{}
	}
	if meta["ID"] == "" {
		meta["ID"] = ml.name
	}
	return meta, nil
}
// getReference creates and registers a new RWLayer reference for this
// mount.
func (ml *mountedLayer) getReference() RWLayer {
	ref := &referencedRWLayer{mountedLayer: ml}

	ml.Lock()
	defer ml.Unlock()
	ml.references[ref] = ref
	return ref
}
// hasReferences reports whether any RWLayer references are still
// outstanding for this mount.
func (ml *mountedLayer) hasReferences() bool {
	ml.Lock()
	defer ml.Unlock()
	return len(ml.references) > 0
}
// deleteReference removes ref from this mount's reference set, returning
// ErrLayerNotRetained when ref was never handed out (or already removed).
func (ml *mountedLayer) deleteReference(ref RWLayer) error {
	ml.Lock()
	defer ml.Unlock()
	_, retained := ml.references[ref]
	if !retained {
		return ErrLayerNotRetained
	}
	delete(ml.references, ref)
	return nil
}
// retakeReference re-registers a reference that was removed by
// deleteReference; used to undo a release when cleanup fails partway.
// References of unknown concrete type are ignored.
func (ml *mountedLayer) retakeReference(r RWLayer) {
	ref, ok := r.(*referencedRWLayer)
	if !ok {
		return
	}
	ml.Lock()
	defer ml.Unlock()
	ml.references[ref] = ref
}
// referencedRWLayer is a distinct handle onto a mountedLayer; each handle
// is tracked in the mount's reference map so releases can be counted.
type referencedRWLayer struct {
*mountedLayer
}
// Mount mounts the RW layer via the graph driver and returns the
// filesystem path.
func (rl *referencedRWLayer) Mount(mountLabel string) (string, error) {
return rl.layerStore.driver.Get(rl.mountedLayer.mountID, mountLabel)
}
// Unmount decrements the activity count and unmounts the underlying layer
// Callers should only call `Unmount` once per call to `Mount`, even on error.
func (rl *referencedRWLayer) Unmount() error {
return rl.layerStore.driver.Put(rl.mountedLayer.mountID)
}
// ApplyDiff applies specified diff to the layer
func (rl *referencedRWLayer) ApplyDiff(diff io.Reader) (int64, error) {
return rl.layerStore.driver.ApplyDiff(rl.mountID, rl.cacheParent(), diff)
}
package layer
import (
"fmt"
"io"
"github.com/docker/distribution"
"github.com/opencontainers/go-digest"
)
// roLayer is the in-memory representation of a registered read-only
// layer. Reference bookkeeping is guarded by the layer store's layerL
// lock (see callers), not by the struct itself.
type roLayer struct {
// chainID identifies the layer by its full diff-ID chain.
chainID ChainID
// diffID is the digest of this layer's uncompressed diff.
diffID DiffID
parent *roLayer
// cacheID is the graph-driver ID holding the layer content.
cacheID string
// size is this layer's own diff size, excluding parents.
size int64
layerStore *layerStore
descriptor distribution.Descriptor
referenceCount int
references map[Layer]struct{}
}
// TarStream for roLayer guarantees that the data that is produced is the
// exact data that the layer was registered with: the reassembled stream is
// verified against the layer's diffID as it is read.
func (rl *roLayer) TarStream() (io.ReadCloser, error) {
	raw, err := rl.layerStore.getTarStream(rl)
	if err != nil {
		return nil, err
	}
	return newVerifiedReadCloser(raw, rl.diffID)
}
// TarStreamFrom does not make any guarantees to the correctness of the
// produced data. As such it should not be used when the layer content must
// be verified to be an exact match to the registered layer. The parent
// argument must be the empty chain ID or an ancestor of this layer.
func (rl *roLayer) TarStreamFrom(parent ChainID) (io.ReadCloser, error) {
	var parentCacheID string
	for ancestor := rl.parent; ancestor != nil; ancestor = ancestor.parent {
		if ancestor.chainID == parent {
			parentCacheID = ancestor.cacheID
			break
		}
	}
	if parent != "" && parentCacheID == "" {
		return nil, fmt.Errorf("layer ID '%s' is not a parent of the specified layer: cannot provide diff to non-parent", parent)
	}
	return rl.layerStore.driver.Diff(rl.cacheID, parentCacheID)
}
// CacheID returns the graph-driver ID holding this layer's content.
func (rl *roLayer) CacheID() string {
return rl.cacheID
}
// ChainID returns the layer's chain ID, derived from the chain of diff
// IDs up to and including this layer.
func (rl *roLayer) ChainID() ChainID {
return rl.chainID
}
// DiffID returns the digest of this layer's uncompressed diff.
func (rl *roLayer) DiffID() DiffID {
return rl.diffID
}
// Parent returns the parent read-only layer, or a literal nil interface
// when this is a base layer (avoiding a non-nil interface around a nil
// pointer).
func (rl *roLayer) Parent() Layer {
	if p := rl.parent; p != nil {
		return p
	}
	return nil
}
// Size returns the cumulative size of this layer and all of its
// ancestors.
func (rl *roLayer) Size() int64 {
	var total int64
	for l := rl; l != nil; l = l.parent {
		total += l.size
	}
	return total
}
// DiffSize returns the size of this layer's own diff, excluding parents.
func (rl *roLayer) DiffSize() int64 {
return rl.size
}
// Metadata returns driver-specific metadata for the layer's storage.
func (rl *roLayer) Metadata() (map[string]string, error) {
return rl.layerStore.driver.GetMetadata(rl.cacheID)
}
// referencedCacheLayer is a distinct handle onto a roLayer; each handle
// is a separate key in the layer's reference set.
type referencedCacheLayer struct {
*roLayer
}
// getReference creates and registers a new Layer reference for this
// layer. NOTE(review): appears to rely on the caller holding the layer
// store lock — confirm.
func (rl *roLayer) getReference() Layer {
	ref := &referencedCacheLayer{roLayer: rl}
	rl.references[ref] = struct{}{}
	return ref
}
// hasReference reports whether ref was handed out by this layer and not
// yet deleted.
func (rl *roLayer) hasReference(ref Layer) bool {
_, ok := rl.references[ref]
return ok
}
// hasReferences reports whether any references are still outstanding.
func (rl *roLayer) hasReferences() bool {
return len(rl.references) > 0
}
// deleteReference removes ref from the reference set.
func (rl *roLayer) deleteReference(ref Layer) {
delete(rl.references, ref)
}
// depth reports how many layers make up the chain ending at rl
// (always >= 1).
func (rl *roLayer) depth() int {
	n := 0
	for l := rl; l != nil; l = l.parent {
		n++
	}
	return n
}
// storeLayer writes the layer's metadata (diff ID, size, cache ID,
// optional descriptor, optional parent chain ID) into the given metadata
// transaction. The transaction is committed or cancelled by the caller.
func storeLayer(tx *fileMetadataTransaction, layer *roLayer) error {
if err := tx.SetDiffID(layer.diffID); err != nil {
return err
}
if err := tx.SetSize(layer.size); err != nil {
return err
}
if err := tx.SetCacheID(layer.cacheID); err != nil {
return err
}
// Do not store empty descriptors
if layer.descriptor.Digest != "" {
if err := tx.SetDescriptor(layer.descriptor); err != nil {
return err
}
}
if layer.parent != nil {
if err := tx.SetParent(layer.parent.chainID); err != nil {
return err
}
}
return nil
}
// newVerifiedReadCloser wraps rc so the content read through it is
// verified against dgst when EOF is reached. The error result is always
// nil; it exists for interface symmetry with callers.
func newVerifiedReadCloser(rc io.ReadCloser, dgst digest.Digest) (io.ReadCloser, error) {
	vrc := &verifiedReadCloser{
		rc:       rc,
		dgst:     dgst,
		verifier: dgst.Verifier(),
	}
	return vrc, nil
}
// verifiedReadCloser feeds everything read from rc into a digest
// verifier and reports a mismatch as an error at EOF.
type verifiedReadCloser struct {
rc io.ReadCloser
// dgst is the expected digest; used for the error message.
dgst digest.Digest
verifier digest.Verifier
}
// Read proxies to the underlying reader, writing every byte read into the
// digest verifier. When the underlying reader returns io.EOF, the
// accumulated digest is checked and a mismatch is converted into an error.
func (vrc *verifiedReadCloser) Read(p []byte) (int, error) {
n, err := vrc.rc.Read(p)
if n > 0 {
// A verifier write failure aborts the read; the inner err shadows
// the outer one deliberately here.
if n2, err := vrc.verifier.Write(p[:n]); err != nil {
return n2, err
}
}
if err == io.EOF {
if !vrc.verifier.Verified() {
return n, fmt.Errorf("could not verify layer data for: %s. This may be because internal files in the layer store were modified. Re-pulling or rebuilding this image may resolve the issue", vrc.dgst)
}
}
return n, err
}
// Close closes the underlying reader.
func (vrc *verifiedReadCloser) Close() error {
return vrc.rc.Close()
}
package caps
// DefaultCapabilities returns a Linux kernel default capabilities
func DefaultCapabilities() []string {
return []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
}
}
package caps
import (
"fmt"
"strings"
"github.com/docker/docker/errdefs"
)
var (
// allCaps holds the capabilities available in the current environment
// (or all known capabilities if detection failed); populated once by
// initCaps.
allCaps []string
// knownCapabilities is a map of all known capabilities, using capability
// name as index. Nil values indicate that the capability is known, but either
// not supported by the Kernel, or not available in the current environment,
// for example, when running Docker-in-Docker with restricted capabilities.
//
// Capabilities are one of the security systems in Linux Security Module (LSM)
// framework provided by the kernel.
// For more details on capabilities, see http://man7.org/linux/man-pages/man7/capabilities.7.html
knownCaps map[string]*struct{}
)
// GetAllCapabilities returns all capabilities that are available in the current
// environment. The result is computed once (via initCaps) and shared;
// callers must not modify it.
func GetAllCapabilities() []string {
initCaps()
return allCaps
}
// knownCapabilities returns a map of all known capabilities, using capability
// name as index. Nil values indicate that the capability is known, but either
// not supported by the Kernel, or not available in the current environment, for
// example, when running Docker-in-Docker with restricted capabilities.
func knownCapabilities() map[string]*struct{} {
initCaps()
return knownCaps
}
// inSlice reports whether s is an exact (case-sensitive) element of
// slice.
func inSlice(slice []string, s string) bool {
	for i := range slice {
		if slice[i] == s {
			return true
		}
	}
	return false
}
// allCapabilities is the magic value accepted in CapAdd/CapDrop meaning
// "all capabilities".
const allCapabilities = "ALL"
// NormalizeLegacyCapabilities normalizes, and validates CapAdd/CapDrop capabilities
// by upper-casing them, and adding a CAP_ prefix (if not yet present).
//
// This function also accepts the "ALL" magic-value, that's used by CapAdd/CapDrop.
func NormalizeLegacyCapabilities(caps []string) ([]string, error) {
	known := knownCapabilities()

	var normalized []string
	for _, c := range caps {
		c = strings.ToUpper(c)
		if c == allCapabilities {
			normalized = append(normalized, c)
			continue
		}
		if !strings.HasPrefix(c, "CAP_") {
			c = "CAP_" + c
		}
		v, ok := known[c]
		if !ok {
			return nil, errdefs.InvalidParameter(fmt.Errorf("unknown capability: %q", c))
		}
		// A known capability with a nil entry is unavailable here.
		if v == nil {
			return nil, errdefs.InvalidParameter(fmt.Errorf("capability not supported by your kernel or not available in the current environment: %q", c))
		}
		normalized = append(normalized, c)
	}
	return normalized, nil
}
// TweakCapabilities tweaks capabilities by adding, dropping, or overriding
// capabilities in the basics capabilities list. All capabilities are added
// if privileged is true.
func TweakCapabilities(basics, adds, drops []string, privileged bool) ([]string, error) {
switch {
case privileged:
// Privileged containers get all capabilities
return GetAllCapabilities(), nil
case len(adds) == 0 && len(drops) == 0:
// Nothing to tweak; we're done
return basics, nil
}
// Validate and normalize both lists before combining them.
capDrop, err := NormalizeLegacyCapabilities(drops)
if err != nil {
return nil, err
}
capAdd, err := NormalizeLegacyCapabilities(adds)
if err != nil {
return nil, err
}
var caps []string
switch {
case inSlice(capAdd, allCapabilities):
// Add all capabilities except ones on capDrop
for _, c := range GetAllCapabilities() {
if !inSlice(capDrop, c) {
caps = append(caps, c)
}
}
case inSlice(capDrop, allCapabilities):
// "Drop" all capabilities; use what's in capAdd instead
caps = capAdd
default:
// First drop some capabilities
for _, c := range basics {
if !inSlice(capDrop, c) {
caps = append(caps, c)
}
}
// Then add the list of capabilities from capAdd
caps = append(caps, capAdd...)
}
return caps, nil
}
package caps
import (
"context"
"sync"
ccaps "github.com/containerd/containerd/v2/pkg/cap"
"github.com/containerd/log"
)
// initCapsOnce guards the one-time population of allCaps and knownCaps.
var initCapsOnce sync.Once
// initCaps populates allCaps with the capabilities detected in the
// current environment (falling back to all known capabilities if
// detection fails) and knownCaps with availability information for every
// known capability.
func initCaps() {
initCapsOnce.Do(func() {
rawCaps := ccaps.Known()
curCaps, err := ccaps.Current()
if err != nil {
log.G(context.TODO()).WithError(err).Error("failed to get capabilities from current environment")
allCaps = rawCaps
} else {
allCaps = curCaps
}
knownCaps = make(map[string]*struct{}, len(rawCaps))
for _, capName := range rawCaps {
// For now, we assume the capability is available if we failed to
// get the capabilities from the current environment. This keeps the
// old (pre-detection) behavior, and prevents creating containers with
// no capabilities. The OCI runtime or kernel may still refuse capa-
// bilities that are not available, and produce an error in that case.
if len(curCaps) > 0 && !inSlice(curCaps, capName) {
knownCaps[capName] = nil
continue
}
knownCaps[capName] = &struct{}{}
}
})
}
// TODO(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package oci
import (
"fmt"
"os"
"runtime"
"sync"
"github.com/docker/docker/internal/platform"
"github.com/docker/docker/oci/caps"
"github.com/opencontainers/runtime-spec/specs-go"
)
// iPtr returns a pointer to a fresh copy of the given int64 value.
func iPtr(i int64) *int64 {
	v := i
	return &v
}
// defaultUnixPathEnv is the PATH value used for non-Windows containers.
const defaultUnixPathEnv = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"

// DefaultPathEnv is unix style list of directories to search for
// executables. Each directory is separated from the next by a colon
// ':' character .
// For Windows containers, an empty string is returned as the default
// path will be set by the container, and Docker has no context of what the
// default path should be.
//
// TODO(thaJeztah) align Windows default with BuildKit; see https://github.com/moby/buildkit/pull/1747
// TODO(thaJeztah) use defaults from containerd (but align it with BuildKit; see https://github.com/moby/buildkit/pull/1747)
func DefaultPathEnv(goos string) string {
	// Parameter renamed from "os", which shadowed the imported os package
	// in this file. Renaming a parameter is call-compatible in Go.
	if goos == "windows" {
		return ""
	}
	return defaultUnixPathEnv
}
// DefaultSpec returns the default spec used by docker for the current Platform
// (selected at runtime via runtime.GOOS).
func DefaultSpec() specs.Spec {
if runtime.GOOS == "windows" {
return DefaultWindowsSpec()
}
return DefaultLinuxSpec()
}
// DefaultWindowsSpec creates a default spec for running Windows
// containers: current spec version with empty Windows, Process, and Root
// sections.
func DefaultWindowsSpec() specs.Spec {
	spec := specs.Spec{Version: specs.Version}
	spec.Windows = &specs.Windows{}
	spec.Process = &specs.Process{}
	spec.Root = &specs.Root{}
	return spec
}
// DefaultLinuxSpec create a default spec for running Linux containers:
// default capability set, standard pseudo-filesystem mounts, default
// masked/read-only proc paths, the five standard namespaces, and a
// default device cgroup whitelist.
func DefaultLinuxSpec() specs.Spec {
return specs.Spec{
Version: specs.Version,
Process: &specs.Process{
Capabilities: &specs.LinuxCapabilities{
Bounding: caps.DefaultCapabilities(),
Permitted: caps.DefaultCapabilities(),
Effective: caps.DefaultCapabilities(),
},
},
Root: &specs.Root{},
Mounts: []specs.Mount{
{
Destination: "/proc",
Type: "proc",
Source: "proc",
Options: []string{"nosuid", "noexec", "nodev"},
},
{
Destination: "/dev",
Type: "tmpfs",
Source: "tmpfs",
Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
},
{
Destination: "/dev/pts",
Type: "devpts",
Source: "devpts",
Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
},
{
Destination: "/sys",
Type: "sysfs",
Source: "sysfs",
Options: []string{"nosuid", "noexec", "nodev", "ro"},
},
{
Destination: "/sys/fs/cgroup",
Type: "cgroup",
Source: "cgroup",
Options: []string{"ro", "nosuid", "noexec", "nodev"},
},
{
Destination: "/dev/mqueue",
Type: "mqueue",
Source: "mqueue",
Options: []string{"nosuid", "noexec", "nodev"},
},
{
Destination: "/dev/shm",
Type: "tmpfs",
Source: "shm",
Options: []string{"nosuid", "noexec", "nodev", "mode=1777"},
},
},
Linux: &specs.Linux{
MaskedPaths: defaultLinuxMaskedPaths(),
ReadonlyPaths: []string{
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger",
},
Namespaces: []specs.LinuxNamespace{
{Type: specs.MountNamespace},
{Type: specs.NetworkNamespace},
{Type: specs.UTSNamespace},
{Type: specs.PIDNamespace},
{Type: specs.IPCNamespace},
},
// Devices implicitly contains the following devices:
// null, zero, full, random, urandom, tty, console, and ptmx.
// ptmx is a bind mount or symlink of the container's ptmx.
// See also: https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md#default-devices
Devices: []specs.LinuxDevice{},
Resources: &specs.LinuxResources{
Devices: []specs.LinuxDeviceCgroup{
// Default rule: deny everything not explicitly allowed below.
{
Allow: false,
Access: "rwm",
},
// c 1:5 /dev/zero
{
Allow: true,
Type: "c",
Major: iPtr(1),
Minor: iPtr(5),
Access: "rwm",
},
// c 1:3 /dev/null
{
Allow: true,
Type: "c",
Major: iPtr(1),
Minor: iPtr(3),
Access: "rwm",
},
// c 1:9 /dev/urandom
{
Allow: true,
Type: "c",
Major: iPtr(1),
Minor: iPtr(9),
Access: "rwm",
},
// c 1:8 /dev/random
{
Allow: true,
Type: "c",
Major: iPtr(1),
Minor: iPtr(8),
Access: "rwm",
},
// c 5:0 /dev/tty
{
Allow: true,
Type: "c",
Major: iPtr(5),
Minor: iPtr(0),
Access: "rwm",
},
// c 5:1 /dev/console
{
Allow: true,
Type: "c",
Major: iPtr(5),
Minor: iPtr(1),
Access: "rwm",
},
// c 10:229 /dev/fuse — explicitly denied.
{
Allow: false,
Type: "c",
Major: iPtr(10),
Minor: iPtr(229),
Access: "rwm",
},
},
},
},
}
}
// defaultLinuxMaskedPaths returns the default list of paths to mask in a Linux
// container. The paths won't change while the docker daemon is running, so just
// compute them once.
var defaultLinuxMaskedPaths = sync.OnceValue(func() []string {
maskedPaths := []string{
"/proc/acpi",
"/proc/asound",
"/proc/interrupts", // https://github.com/moby/moby/security/advisories/GHSA-6fw5-f8r9-fgfm
"/proc/kcore",
"/proc/keys",
"/proc/latency_stats",
"/proc/sched_debug",
"/proc/scsi",
"/proc/timer_list",
"/proc/timer_stats",
"/sys/devices/virtual/powercap", // https://github.com/moby/moby/security/advisories/GHSA-jq35-85cj-fj4p
"/sys/firmware",
}
// Mask per-CPU thermal_throttle dirs that exist on this host.
// https://github.com/moby/moby/security/advisories/GHSA-6fw5-f8r9-fgfm
cpus := platform.PossibleCPU()
for _, cpu := range cpus {
path := fmt.Sprintf("/sys/devices/system/cpu/cpu%d/thermal_throttle", cpu)
if _, err := os.Stat(path); err == nil {
maskedPaths = append(maskedPaths, path)
}
}
return maskedPaths
})
package oci
import (
"errors"
"fmt"
"os"
"path/filepath"
"strings"
coci "github.com/containerd/containerd/v2/pkg/oci"
"github.com/opencontainers/runtime-spec/specs-go"
)
// deviceCgroup builds an "allow" device-cgroup rule for the given device
// node with the requested access permissions (some subset of "rwm").
func deviceCgroup(d *specs.LinuxDevice, permissions string) specs.LinuxDeviceCgroup {
	rule := specs.LinuxDeviceCgroup{
		Allow:  true,
		Type:   d.Type,
		Access: permissions,
	}
	rule.Major = &d.Major
	rule.Minor = &d.Minor
	return rule
}
// DevicesFromPath computes a list of devices and device permissions from paths (pathOnHost and pathInContainer) and cgroup permissions.
// When pathOnHost is a directory, all device nodes found under it
// (recursively) are added, with their container paths rebased onto
// pathInContainer. An error is returned only when no device was found.
func DevicesFromPath(pathOnHost, pathInContainer, cgroupPermissions string) (devs []specs.LinuxDevice, devPermissions []specs.LinuxDeviceCgroup, _ error) {
resolvedPathOnHost := pathOnHost
// check if it is a symbolic link
if src, e := os.Lstat(pathOnHost); e == nil && src.Mode()&os.ModeSymlink == os.ModeSymlink {
if linkedPathOnHost, e := filepath.EvalSymlinks(pathOnHost); e == nil {
resolvedPathOnHost = linkedPathOnHost
}
}
device, err := coci.DeviceFromPath(resolvedPathOnHost)
// if there was no error, return the device
if err == nil {
device.Path = pathInContainer
return append(devs, *device), append(devPermissions, deviceCgroup(device, cgroupPermissions)), nil
}
// if the device is not a device node
// try to see if it's a directory holding many devices
if errors.Is(err, coci.ErrNotADevice) {
// check if it is a directory
if src, e := os.Stat(resolvedPathOnHost); e == nil && src.IsDir() {
// mount the internal devices recursively
// TODO check if additional errors should be handled or logged
_ = filepath.WalkDir(resolvedPathOnHost, func(dpath string, f os.DirEntry, _ error) error {
childDevice, e := coci.DeviceFromPath(dpath)
if e != nil {
// ignore the device
return nil
}
// add the device to userSpecified devices
childDevice.Path = strings.Replace(dpath, resolvedPathOnHost, pathInContainer, 1)
devs = append(devs, *childDevice)
devPermissions = append(devPermissions, deviceCgroup(childDevice, cgroupPermissions))
return nil
})
}
}
if len(devs) > 0 {
return devs, devPermissions, nil
}
// Nothing usable found; surface the original lookup error.
return devs, devPermissions, fmt.Errorf("error gathering device information while adding custom device %q: %s", pathOnHost, err)
}
package oci
import "github.com/opencontainers/runtime-spec/specs-go"
// RemoveNamespace removes the first namespace of type nsType from OCI
// spec s. It is a no-op when s.Linux is nil or no such namespace exists.
func RemoveNamespace(s *specs.Spec, nsType specs.LinuxNamespaceType) {
	if s.Linux == nil {
		return
	}
	nss := s.Linux.Namespaces
	for i := range nss {
		if nss[i].Type != nsType {
			continue
		}
		s.Linux.Namespaces = append(nss[:i], nss[i+1:]...)
		return
	}
}
// NamespacePath returns the configured Path of the first namespace in
// s.Linux.Namespaces of type nsType. It returns ok=false when no such
// namespace is configured.
func NamespacePath(s *specs.Spec, nsType specs.LinuxNamespaceType) (path string, ok bool) {
	// Guard against a nil Linux section, consistent with RemoveNamespace;
	// the previous code dereferenced s.Linux unconditionally and would
	// panic on a Windows-style spec.
	if s.Linux == nil {
		return "", false
	}
	for _, n := range s.Linux.Namespaces {
		if n.Type == nsType {
			return n.Path, true
		}
	}
	return "", false
}
package oci
import (
"fmt"
"strconv"
"github.com/docker/docker/internal/lazyregexp"
"github.com/opencontainers/runtime-spec/specs-go"
)
// TODO verify if this regex is correct for "a" (all);
//
// The docs (https://github.com/torvalds/linux/blob/v5.10/Documentation/admin-guide/cgroup-v1/devices.rst) describe:
// "'all' means it applies to all types and all major and minor numbers", and shows an example
// that *only* passes `a` as value: `echo a > /sys/fs/cgroup/1/devices.allow, which would be
// the "implicit" equivalent of "a *:* rwm". Source-code also looks to confirm this, and returns
// early for "a" (all); https://github.com/torvalds/linux/blob/v5.10/security/device_cgroup.c#L614-L642
//
// Capture groups: 1 = type (a/c/b), 2 = major or '*', 3 = minor or '*',
// 4 = access (1-3 of r/w/m).
var deviceCgroupRuleRegex = lazyregexp.New("^([acb]) ([0-9]+|\\*):([0-9]+|\\*) ([rwm]{1,3})$")
// SetCapabilities sets the provided capabilities on the spec, replacing
// the Effective, Bounding, and Permitted sets. The error result is always
// nil.
//
// Deprecated: this function is no longer used and will be removed in the next release.
func SetCapabilities(s *specs.Spec, caplist []string) error {
if s.Process == nil {
s.Process = &specs.Process{}
}
s.Process.Capabilities = &specs.LinuxCapabilities{
Effective: caplist,
Bounding: caplist,
Permitted: caplist,
}
return nil
}
// AppendDevicePermissionsFromCgroupRules takes rules for the devices cgroup to append to the default set.
// Each rule has the form "<type> <major|*>:<minor|*> <access>"; a '*'
// major/minor is translated to -1 (any) in the resulting rule.
func AppendDevicePermissionsFromCgroupRules(devPermissions []specs.LinuxDeviceCgroup, rules []string) ([]specs.LinuxDeviceCgroup, error) {
	for _, deviceCgroupRule := range rules {
		// The pattern is fully anchored (^...$), so a single
		// FindStringSubmatch suffices; the previous FindAllStringSubmatch
		// with -1 could never yield more than one match. A successful
		// match always has 5 elements (full match + 4 groups).
		matches := deviceCgroupRuleRegex.FindStringSubmatch(deviceCgroupRule)
		if len(matches) != 5 {
			return nil, fmt.Errorf("invalid device cgroup rule format: '%s'", deviceCgroupRule)
		}
		dPermissions := specs.LinuxDeviceCgroup{
			Allow:  true,
			Type:   matches[1],
			Access: matches[4],
		}
		if matches[2] == "*" {
			major := int64(-1)
			dPermissions.Major = &major
		} else {
			major, err := strconv.ParseInt(matches[2], 10, 64)
			if err != nil {
				return nil, fmt.Errorf("invalid major value in device cgroup rule format: '%s'", deviceCgroupRule)
			}
			dPermissions.Major = &major
		}
		if matches[3] == "*" {
			minor := int64(-1)
			dPermissions.Minor = &minor
		} else {
			minor, err := strconv.ParseInt(matches[3], 10, 64)
			if err != nil {
				return nil, fmt.Errorf("invalid minor value in device cgroup rule format: '%s'", deviceCgroupRule)
			}
			dPermissions.Minor = &minor
		}
		devPermissions = append(devPermissions, dPermissions)
	}
	return devPermissions, nil
}
package authorization
import (
"crypto/x509"
"encoding/json"
"encoding/pem"
"errors"
)
// Plugin RPC endpoint names for the authorization ("authz") subsystem.
const (
// AuthZApiRequest is the url for daemon request authorization
AuthZApiRequest = "AuthZPlugin.AuthZReq"
// AuthZApiResponse is the url for daemon response authorization
AuthZApiResponse = "AuthZPlugin.AuthZRes"
// AuthZApiImplements is the name of the interface all AuthZ plugins implement
AuthZApiImplements = "authz"
)
// PeerCertificate is a wrapper around x509.Certificate which provides a sane
// encoding/decoding to/from PEM format and JSON.
type PeerCertificate x509.Certificate
// MarshalJSON returns the certificate's raw DER bytes wrapped in a PEM
// "CERTIFICATE" block, JSON-encoded as a string.
func (pc *PeerCertificate) MarshalJSON() ([]byte, error) {
	block := &pem.Block{Type: "CERTIFICATE", Bytes: pc.Raw}
	return json.Marshal(pem.EncodeToMemory(block))
}
// UnmarshalJSON populates a new PeerCertificate struct from JSON data.
// The input must be a JSON string containing a PEM-encoded certificate.
func (pc *PeerCertificate) UnmarshalJSON(b []byte) error {
	var buf []byte
	if err := json.Unmarshal(b, &buf); err != nil {
		return err
	}
	// pem.Decode returns a nil block when no PEM data is found; the
	// previous code dereferenced the result unconditionally, panicking on
	// malformed input.
	derBlock, _ := pem.Decode(buf)
	if derBlock == nil {
		return errors.New("invalid certificate: no PEM data found")
	}
	c, err := x509.ParseCertificate(derBlock.Bytes)
	if err != nil {
		return err
	}
	*pc = PeerCertificate(*c)
	return nil
}
// Request holds data required for authZ plugins. It is serialized and
// sent to each plugin for both the request and the response phase; the
// Response* fields are only populated in the response phase.
type Request struct {
// User holds the user extracted by AuthN mechanism
User string `json:"User,omitempty"`
// UserAuthNMethod holds the mechanism used to extract user details (e.g., krb)
UserAuthNMethod string `json:"UserAuthNMethod,omitempty"`
// RequestMethod holds the HTTP method (GET/POST/PUT)
RequestMethod string `json:"RequestMethod,omitempty"`
// RequestURI holds the full HTTP uri (e.g., /v1.21/version)
RequestURI string `json:"RequestUri,omitempty"`
// RequestBody stores the raw request body sent to the docker daemon
RequestBody []byte `json:"RequestBody,omitempty"`
// RequestHeaders stores the raw request headers sent to the docker daemon
RequestHeaders map[string]string `json:"RequestHeaders,omitempty"`
// RequestPeerCertificates stores the request's TLS peer certificates in PEM format
RequestPeerCertificates []*PeerCertificate `json:"RequestPeerCertificates,omitempty"`
// ResponseStatusCode stores the status code returned from docker daemon
ResponseStatusCode int `json:"ResponseStatusCode,omitempty"`
// ResponseBody stores the raw response body sent from docker daemon
ResponseBody []byte `json:"ResponseBody,omitempty"`
// ResponseHeaders stores the response headers sent to the docker daemon
ResponseHeaders map[string]string `json:"ResponseHeaders,omitempty"`
}
// Response represents authZ plugin response. A response with Allow == false
// results in the daemon rejecting the client request with a 403-class error.
type Response struct {
	// Allow indicating whether the user is allowed or not
	Allow bool `json:"Allow"`
	// Msg stores the authorization message
	Msg string `json:"Msg,omitempty"`
	// Err stores a message in case there's an error
	Err string `json:"Err,omitempty"`
}
package authorization
import (
"bufio"
"bytes"
"context"
"fmt"
"io"
"mime"
"net/http"
"net/url"
"regexp"
"strings"
"github.com/containerd/log"
"github.com/docker/docker/pkg/ioutils"
)
// maxBodySize caps the request/response body that is captured and forwarded
// to authorization plugins; larger bodies are skipped (see drainBody).
const maxBodySize = 1048576 // 1MB
// NewCtx creates a new authZ context; it stores authorization information
// related to a specific docker REST http session.
//
// A context provides two operations:
//
// AuthZRequest: calls the authZ plugins with the current REST request and
// AuthN response. The request contains the full HTTP packet sent to the
// docker daemon (https://docs.docker.com/reference/api/engine/).
//
// AuthZResponse: calls the authZ plugins with full info about the current
// REST request, REST response and AuthN response. The response from this
// operation may contain content that overrides the daemon response, which
// allows authZ plugins to filter privileged content.
//
// If multiple authZ plugins are specified, the block/allow decision is based
// on ANDing all plugin results. For response manipulation, the response from
// each plugin is piped between plugins; plugin execution order is determined
// according to daemon parameters.
func NewCtx(authZPlugins []Plugin, user, userAuthNMethod, requestMethod, requestURI string) *Ctx {
	authCtx := &Ctx{}
	authCtx.plugins = authZPlugins
	authCtx.user = user
	authCtx.userAuthNMethod = userAuthNMethod
	authCtx.requestMethod = requestMethod
	authCtx.requestURI = requestURI
	return authCtx
}
// Ctx stores a single request-response interaction context
type Ctx struct {
	// user / userAuthNMethod identify the authenticated caller (if any).
	user            string
	userAuthNMethod string
	// requestMethod and requestURI identify the REST call being authorized.
	requestMethod string
	requestURI    string
	// plugins is the chain consulted for both request and response phases.
	plugins []Plugin
	// authReq stores the cached request object for the current transaction
	// (populated by AuthZRequest, reused and extended by AuthZResponse).
	// NOTE(review): a Ctx appears to serve exactly one request/response pair
	// and is not guarded for concurrent use — confirm callers don't share it.
	authReq *Request
}
func isChunked(r *http.Request) bool {
// RFC 7230 specifies that content length is to be ignored if Transfer-Encoding is chunked
if strings.EqualFold(r.Header.Get("Transfer-Encoding"), "chunked") {
return true
}
for _, v := range r.TransferEncoding {
if strings.EqualFold(v, "chunked") {
return true
}
}
return false
}
// AuthZRequest authorized the request to the docker daemon using authZ plugins
func (ctx *Ctx) AuthZRequest(w http.ResponseWriter, r *http.Request) error {
	var body []byte
	// Capture the body only when it should be forwarded to plugins and is
	// small enough. Chunked requests (ContentLength -1/0) pass the size
	// check here and are capped inside drainBody instead.
	if sendBody(ctx.requestURI, r.Header) && (r.ContentLength > 0 || isChunked(r)) && r.ContentLength < maxBodySize {
		var err error
		// drainBody buffers the body and returns a replacement reader so
		// the daemon can still consume the request afterwards.
		body, r.Body, err = drainBody(r.Body)
		if err != nil {
			return err
		}
	}
	// Serialize the headers purely to validate that they can be written;
	// NOTE(review): h itself is not used afterwards — only the error matters.
	var h bytes.Buffer
	if err := r.Header.Write(&h); err != nil {
		return err
	}
	// Cache the request so AuthZResponse can extend it for the same transaction.
	ctx.authReq = &Request{
		User:            ctx.user,
		UserAuthNMethod: ctx.userAuthNMethod,
		RequestMethod:   ctx.requestMethod,
		RequestURI:      ctx.requestURI,
		RequestBody:     body,
		RequestHeaders:  headers(r.Header),
	}
	// Attach the client's TLS peer certificates (PEM-encoded on marshal).
	if r.TLS != nil {
		for _, c := range r.TLS.PeerCertificates {
			pc := PeerCertificate(*c)
			ctx.authReq.RequestPeerCertificates = append(ctx.authReq.RequestPeerCertificates, &pc)
		}
	}
	// Every plugin must allow; the first failure or denial aborts the chain.
	for _, plugin := range ctx.plugins {
		log.G(context.TODO()).Debugf("AuthZ request using plugin %s", plugin.Name())
		authRes, err := plugin.AuthZRequest(ctx.authReq)
		if err != nil {
			return fmt.Errorf("plugin %s failed with error: %s", plugin.Name(), err)
		}
		if !authRes.Allow {
			return newAuthorizationError(plugin.Name(), authRes.Msg)
		}
	}
	return nil
}
// AuthZResponse authorized and manipulates the response from docker daemon using authZ plugins
func (ctx *Ctx) AuthZResponse(rm ResponseModifier, r *http.Request) error {
	// Reuse the Request cached by AuthZRequest and attach the daemon's response.
	ctx.authReq.ResponseStatusCode = rm.StatusCode()
	ctx.authReq.ResponseHeaders = headers(rm.Header())
	if sendBody(ctx.requestURI, rm.Header()) {
		ctx.authReq.ResponseBody = rm.RawBody()
	}
	// Every plugin must allow; the first failure or denial aborts the chain.
	for _, plugin := range ctx.plugins {
		log.G(context.TODO()).Debugf("AuthZ response using plugin %s", plugin.Name())
		authRes, err := plugin.AuthZResponse(ctx.authReq)
		if err != nil {
			return fmt.Errorf("plugin %s failed with error: %s", plugin.Name(), err)
		}
		if !authRes.Allow {
			return newAuthorizationError(plugin.Name(), authRes.Msg)
		}
	}
	// Forward the (possibly plugin-modified) buffered response to the client.
	rm.FlushAll()
	return nil
}
// drainBody dump the body (if its length is less than 1MB) without modifying the request state
func drainBody(body io.ReadCloser) ([]byte, io.ReadCloser, error) {
	// Buffer up to maxBodySize bytes; the wrapper replays the buffered
	// reader while still closing the original body on Close.
	bufReader := bufio.NewReaderSize(body, maxBodySize)
	newBody := ioutils.NewReadCloserWrapper(bufReader, func() error { return body.Close() })
	data, err := bufReader.Peek(maxBodySize)
	// Body size exceeds max body size
	// (Peek returning nil error means it filled the entire buffer, i.e. the
	// body holds at least maxBodySize bytes: skip forwarding the body to the
	// plugin, but keep the replayable reader for the daemon.)
	if err == nil {
		log.G(context.TODO()).Warnf("Request body is larger than: '%d' skipping body", maxBodySize)
		return nil, newBody, nil
	}
	// Body size is less than maximum size
	if err == io.EOF {
		return data, newBody, nil
	}
	// Unknown error
	return nil, newBody, err
}
// authEndpointPattern matches auth endpoints with an optional API-version
// prefix, e.g. "/v1.24/auth/optional?..." or "/auth". Compiled once at
// package init instead of on every request (regexp.MatchString recompiles
// the pattern on each call).
var authEndpointPattern = regexp.MustCompile(`^[^\/]*\/(v\d[\d\.]*\/)?auth.*`)

// isAuthEndpoint returns whether urlPath targets the /auth endpoint.
// The error return value is kept for backward compatibility with existing
// callers; it is always nil now that the pattern is compiled at init time.
func isAuthEndpoint(urlPath string) (bool, error) {
	// eg www.test.com/v1.24/auth/optional?optional1=something&optional2=something (version optional)
	return authEndpointPattern.MatchString(urlPath), nil
}
// sendBody returns true when request/response body should be sent to AuthZPlugin
func sendBody(inURL string, header http.Header) bool {
	parsed, err := url.Parse(inURL)
	if err != nil {
		// Assume no if the URL cannot be parsed - an empty request will
		// still be forwarded to the plugin and should be rejected.
		return false
	}
	// Never forward bodies for the auth endpoint (they contain credentials).
	if isAuth, err := isAuthEndpoint(parsed.Path); err != nil || isAuth {
		return false
	}
	// Bodies are forwarded only for JSON payloads.
	mediaType, _, err := mime.ParseMediaType(header.Get("Content-Type"))
	return err == nil && mediaType == "application/json"
}
// headers returns flatten version of the http headers excluding authorization
func headers(header http.Header) map[string]string {
v := make(map[string]string)
for k, values := range header {
// Skip authorization headers
if strings.EqualFold(k, "Authorization") || strings.EqualFold(k, "X-Registry-Config") || strings.EqualFold(k, "X-Registry-Auth") {
continue
}
for _, val := range values {
v[k] = val
}
}
return v
}
// authorizationError represents an authorization deny error. It embeds the
// underlying error and is recognized as a "forbidden" API error.
type authorizationError struct {
	error
}

// Forbidden marks this error as an HTTP-403-class error.
func (authorizationError) Forbidden() {}

// newAuthorizationError builds the error returned when the named plugin
// denies a request or response with the given message.
func newAuthorizationError(plugin, msg string) authorizationError {
	denied := fmt.Errorf("authorization denied by plugin %s: %s", plugin, msg)
	return authorizationError{error: denied}
}
package authorization
import (
"context"
"net/http"
"sync"
"github.com/containerd/log"
"github.com/docker/docker/pkg/plugingetter"
)
// Middleware uses a list of plugins to
// handle authorization in the API requests.
type Middleware struct {
	// mu guards plugins; the chain can be swapped at runtime via
	// SetPlugins/RemovePlugin while requests are in flight.
	mu      sync.Mutex
	plugins []Plugin
}
// NewMiddleware creates a new Middleware
// with a slice of plugins names.
func NewMiddleware(names []string, pg plugingetter.PluginGetter) *Middleware {
	// Register the getter globally so lazy plugin initialization can use it.
	SetPluginGetter(pg)
	m := &Middleware{}
	m.plugins = newPlugins(names)
	return m
}
// getAuthzPlugins returns a snapshot of the current plugin chain.
func (m *Middleware) getAuthzPlugins() []Plugin {
	m.mu.Lock()
	current := m.plugins
	m.mu.Unlock()
	return current
}
// SetPlugins sets the plugin used for authorization
func (m *Middleware) SetPlugins(names []string) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.plugins = newPlugins(names)
}
// RemovePlugin removes a single plugin from this authz middleware chain
func (m *Middleware) RemovePlugin(name string) {
	m.mu.Lock()
	defer m.mu.Unlock()
	// Filter in place, reusing the existing backing array.
	kept := m.plugins[:0]
	for _, authPlugin := range m.plugins {
		if authPlugin.Name() == name {
			continue
		}
		kept = append(kept, authPlugin)
	}
	m.plugins = kept
}
// WrapHandler returns a new handler function wrapping the previous one in the request chain.
func (m *Middleware) WrapHandler(handler func(ctx context.Context, w http.ResponseWriter, r *http.Request, vars map[string]string) error) func(ctx context.Context, w http.ResponseWriter, r *http.Request, vars map[string]string) error {
	return func(ctx context.Context, w http.ResponseWriter, r *http.Request, vars map[string]string) error {
		plugins := m.getAuthzPlugins()
		// Fast path: no authz plugins configured, call the handler directly.
		if len(plugins) == 0 {
			return handler(ctx, w, r, vars)
		}
		user := ""
		userAuthNMethod := ""
		// Default authorization using existing TLS connection credentials
		// FIXME: Non trivial authorization mechanisms (such as advanced certificate validations, kerberos support
		// and ldap) will be extracted using AuthN feature, which is tracked under:
		// https://github.com/docker/docker/pull/20883
		if r.TLS != nil && len(r.TLS.PeerCertificates) > 0 {
			user = r.TLS.PeerCertificates[0].Subject.CommonName
			userAuthNMethod = "TLS"
		}
		authCtx := NewCtx(plugins, user, userAuthNMethod, r.Method, r.RequestURI)
		if err := authCtx.AuthZRequest(w, r); err != nil {
			log.G(ctx).Errorf("AuthZRequest for %s %s returned error: %s", r.Method, r.RequestURI, err)
			return err
		}
		// Buffer the handler's response so plugins can inspect/modify it
		// before it is flushed to the client.
		rw := NewResponseModifier(w)
		var errD error
		if errD = handler(ctx, rw, r, vars); errD != nil {
			log.G(ctx).Errorf("Handler for %s %s returned error: %s", r.Method, r.RequestURI, errD)
		}
		// There's a chance that the authCtx.plugins was updated. One of the reasons
		// this can happen is when an authzplugin is disabled.
		plugins = m.getAuthzPlugins()
		if len(plugins) == 0 {
			log.G(ctx).Debug("There are no authz plugins in the chain")
			return nil
		}
		authCtx.plugins = plugins
		// A handler error takes precedence: the authz-response error is only
		// surfaced when the handler itself succeeded.
		if err := authCtx.AuthZResponse(rw, r); errD == nil && err != nil {
			log.G(ctx).Errorf("AuthZResponse for %s %s returned error: %s", r.Method, r.RequestURI, err)
			return err
		}
		if errD != nil {
			return errD
		}
		return nil
	}
}
package authorization
import (
"sync"
"github.com/docker/docker/pkg/plugingetter"
"github.com/docker/docker/pkg/plugins"
)
// Plugin allows third party plugins to authorize requests and responses
// in the context of docker API. Plugins are called synchronously, once per
// request and once per response, in the order configured on the daemon.
type Plugin interface {
	// Name returns the registered plugin name
	Name() string
	// AuthZRequest authorizes the request from the client to the daemon
	AuthZRequest(*Request) (*Response, error)
	// AuthZResponse authorizes the response from the daemon to the client
	AuthZResponse(*Request) (*Response, error)
}
// newPlugins constructs and initializes the authorization plugins based on
// plugin names, dropping duplicate names while preserving first-seen order.
func newPlugins(names []string) []Plugin {
	seen := make(map[string]struct{}, len(names))
	authPlugins := []Plugin{}
	for _, name := range names {
		if _, dup := seen[name]; dup {
			continue
		}
		seen[name] = struct{}{}
		authPlugins = append(authPlugins, newAuthorizationPlugin(name))
	}
	return authPlugins
}
// getter is the package-global resolver used to look up authz plugins by
// name during lazy initialization.
// NOTE(review): access is unsynchronized; this assumes SetPluginGetter runs
// before any plugin lookup — confirm against daemon startup order.
var getter plugingetter.PluginGetter

// SetPluginGetter sets the plugingetter
func SetPluginGetter(pg plugingetter.PluginGetter) {
	getter = pg
}

// GetPluginGetter gets the plugingetter
func GetPluginGetter() plugingetter.PluginGetter {
	return getter
}
// authorizationPlugin is an internal adapter to docker plugin system
type authorizationPlugin struct {
	// initErr caches the error (if any) from the one-time plugin lookup.
	initErr error
	// plugin is the client connection, populated lazily by initPlugin.
	plugin *plugins.Client
	// name is the plugin name; may be rewritten to the resolved name on init.
	name string
	// once guards the lazy initialization performed by initPlugin.
	once sync.Once
}

// newAuthorizationPlugin returns a lazily-initialized Plugin adapter; no
// lookup or connection happens until the first AuthZRequest/AuthZResponse.
func newAuthorizationPlugin(name string) Plugin {
	return &authorizationPlugin{name: name}
}

// Name returns the plugin name (possibly rewritten by SetName after init).
func (a *authorizationPlugin) Name() string {
	return a.name
}

// Set the remote for an authz pluginv2
func (a *authorizationPlugin) SetName(remote string) {
	a.name = remote
}
// AuthZRequest forwards the request-phase payload to the plugin's
// AuthZPlugin.AuthZReq endpoint, initializing the plugin connection on
// first use.
func (a *authorizationPlugin) AuthZRequest(authReq *Request) (*Response, error) {
	if err := a.initPlugin(); err != nil {
		return nil, err
	}
	res := &Response{}
	err := a.plugin.Call(AuthZApiRequest, authReq, res)
	if err != nil {
		return nil, err
	}
	return res, nil
}
// AuthZResponse forwards the response-phase payload to the plugin's
// AuthZPlugin.AuthZRes endpoint, initializing the plugin connection on
// first use.
func (a *authorizationPlugin) AuthZResponse(authReq *Request) (*Response, error) {
	if err := a.initPlugin(); err != nil {
		return nil, err
	}
	res := &Response{}
	err := a.plugin.Call(AuthZApiResponse, authReq, res)
	if err != nil {
		return nil, err
	}
	return res, nil
}
// initPlugin initializes the authorization plugin if needed. The lookup runs
// exactly once; its result — the connected client or the lookup error — is
// cached and returned to every subsequent caller.
func (a *authorizationPlugin) initPlugin() error {
	// Lazy loading of plugins
	a.once.Do(func() {
		if a.plugin == nil {
			var plugin plugingetter.CompatPlugin
			var e error
			if pg := GetPluginGetter(); pg != nil {
				plugin, e = pg.Get(a.name, AuthZApiImplements, plugingetter.Lookup)
				// Only rename after a successful lookup: plugin may be nil
				// when Get fails (calling plugin.Name() then would panic),
				// and v2 plugins can resolve to a canonical name that
				// differs from the requested one.
				if e == nil {
					a.SetName(plugin.Name())
				}
			} else {
				plugin, e = plugins.Get(a.name, AuthZApiImplements)
			}
			if e != nil {
				a.initErr = e
				return
			}
			a.plugin = plugin.Client()
		}
	})
	return a.initErr
}
package authorization
import (
"bufio"
"bytes"
"context"
"encoding/json"
"errors"
"net"
"net/http"
"github.com/containerd/log"
)
// ResponseModifier allows authorization plugins to read and modify the content of the http.response
// before it is flushed to the client. It buffers status code, headers and
// (bounded) body written by the daemon's handler.
type ResponseModifier interface {
	http.ResponseWriter
	http.Flusher
	// RawBody returns the current http content
	RawBody() []byte
	// RawHeaders returns the current content of the http headers
	RawHeaders() ([]byte, error)
	// StatusCode returns the current status code
	StatusCode() int
	// OverrideBody replaces the body of the HTTP reply
	OverrideBody(b []byte)
	// OverrideHeader replaces the headers of the HTTP reply
	OverrideHeader(b []byte) error
	// OverrideStatusCode replaces the status code of the HTTP reply
	OverrideStatusCode(statusCode int)
	// FlushAll flushes all data to the HTTP response
	FlushAll() error
	// Hijacked indicates the response has been hijacked by the Docker daemon
	Hijacked() bool
}
// NewResponseModifier creates a wrapper to an http.ResponseWriter to allow inspecting and modifying the content
func NewResponseModifier(rw http.ResponseWriter) ResponseModifier {
	rm := &responseModifier{rw: rw}
	rm.header = make(http.Header)
	return rm
}
// maxBufferSize caps the body buffered by responseModifier; once exceeded,
// already-buffered data is flushed to the underlying writer (see Write).
const maxBufferSize = 64 * 1024
// responseModifier is used as an adapter to http.ResponseWriter in order to manipulate and explore
// the http request/response from docker daemon
type responseModifier struct {
	// The original response writer
	rw http.ResponseWriter
	// body holds the response body (bounded by maxBufferSize; see Write)
	body []byte
	// header holds the response header (copied to rw on FlushAll)
	header http.Header
	// statusCode holds the response status code (0 means "not set yet")
	statusCode int
	// hijacked indicates the request has been hijacked; once set, all
	// operations bypass the buffer and go straight to rw
	hijacked bool
}
// Hijacked reports whether the underlying connection was taken over by the
// daemon (e.g. for attach/exec streams).
func (rm *responseModifier) Hijacked() bool {
	return rm.hijacked
}

// WriteHeader stores the http status code
func (rm *responseModifier) WriteHeader(s int) {
	// Use original request if hijacked
	if rm.hijacked {
		rm.rw.WriteHeader(s)
		return
	}
	// Buffer the status code so plugins can inspect/override it before flush.
	rm.statusCode = s
}

// Header returns the internal http header
func (rm *responseModifier) Header() http.Header {
	// Use original header if hijacked
	if rm.hijacked {
		return rm.rw.Header()
	}
	return rm.header
}

// StatusCode returns the http status code
func (rm *responseModifier) StatusCode() int {
	return rm.statusCode
}

// OverrideBody replaces the body of the HTTP response
func (rm *responseModifier) OverrideBody(b []byte) {
	rm.body = b
}

// OverrideStatusCode replaces the status code of the HTTP response
func (rm *responseModifier) OverrideStatusCode(statusCode int) {
	rm.statusCode = statusCode
}
// OverrideHeader replaces the headers of the HTTP response
func (rm *responseModifier) OverrideHeader(b []byte) error {
	// The plugin supplies the replacement headers as a JSON-encoded
	// map[string][]string (the wire form of http.Header).
	parsed := http.Header{}
	err := json.Unmarshal(b, &parsed)
	if err != nil {
		return err
	}
	rm.header = parsed
	return nil
}
// Write stores the byte array inside content
func (rm *responseModifier) Write(b []byte) (int, error) {
	// After a hijack the daemon owns the connection: bypass the buffer.
	if rm.hijacked {
		return rm.rw.Write(b)
	}
	// Bound the buffer: stream out what we have before buffering more, so
	// memory stays capped (plugins then only see the unflushed tail).
	if len(rm.body)+len(b) > maxBufferSize {
		rm.Flush()
	}
	rm.body = append(rm.body, b...)
	return len(b), nil
}
// RawBody returns the currently buffered (unflushed) response body.
func (rm *responseModifier) RawBody() []byte {
	return rm.body
}

// RawHeaders serializes the buffered headers into HTTP wire format.
func (rm *responseModifier) RawHeaders() ([]byte, error) {
	var b bytes.Buffer
	if err := rm.header.Write(&b); err != nil {
		return nil, err
	}
	return b.Bytes(), nil
}
// Hijack returns the internal connection of the wrapped http.ResponseWriter
func (rm *responseModifier) Hijack() (net.Conn, *bufio.ReadWriter, error) {
	// Verify the underlying writer supports hijacking BEFORE flipping the
	// hijacked flag and flushing: the previous order left the modifier
	// permanently bypassing its buffer even when the hijack attempt failed.
	hijacker, ok := rm.rw.(http.Hijacker)
	if !ok {
		return nil, nil, errors.New("Internal response writer doesn't support the Hijacker interface")
	}
	// Drain any buffered status/headers/body before handing over the raw
	// connection to the daemon.
	rm.hijacked = true
	rm.FlushAll()
	return hijacker.Hijack()
}
// Flush uses the internal flush API of the wrapped http.ResponseWriter
func (rm *responseModifier) Flush() {
	flusher, ok := rm.rw.(http.Flusher)
	if !ok {
		log.G(context.TODO()).Error("Internal response writer doesn't support the Flusher interface")
		return
	}
	// Push buffered status/headers/body to the underlying writer first,
	// then flush the writer itself down the wire.
	rm.FlushAll()
	flusher.Flush()
}
// FlushAll flushes all data to the HTTP response
// (headers first, then status code, then body) and resets the buffered
// state so subsequent writes start a fresh buffer.
func (rm *responseModifier) FlushAll() error {
	// Copy the header
	for k, vv := range rm.header {
		for _, v := range vv {
			rm.rw.Header().Add(k, v)
		}
	}
	// Copy the status code
	// Also WriteHeader needs to be done after all the headers
	// have been copied (above).
	if rm.statusCode > 0 {
		rm.rw.WriteHeader(rm.statusCode)
	}
	var err error
	if len(rm.body) > 0 {
		// Write body
		var n int
		n, err = rm.rw.Write(rm.body)
		// TODO(@cpuguy83): there is now a relatively small buffer limit, instead of discarding our buffer here and
		// allocating again later this should just keep using the same buffer and track the buffer position (like a bytes.Buffer with a fixed size)
		// NOTE(review): on a short write the unwritten tail is intentionally
		// kept in rm.body for a later flush.
		rm.body = rm.body[n:]
	}
	// Clean previous data
	rm.statusCode = 0
	rm.header = http.Header{}
	return err
}
package homedir
import (
"os"
"os/user"
"runtime"
)
// Get returns the home directory of the current user with the help of
// environment variables depending on the target operating system.
// Returned path should be used with "path/filepath" to form new paths.
//
// On non-Windows platforms, it falls back to nss lookups, if the home
// directory cannot be obtained from environment-variables.
//
// If linking statically with cgo enabled against glibc, ensure the
// osusergo build tag is used.
//
// If needing to do nss lookups, do not disable cgo or set osusergo.
func Get() string {
	home, _ := os.UserHomeDir()
	if home != "" || runtime.GOOS == "windows" {
		return home
	}
	// Environment lookup failed on a Unix-like system: fall back to the
	// user database (nss).
	if u, err := user.Current(); err == nil {
		return u.HomeDir
	}
	return home
}
package homedir
import (
"errors"
"os"
"path/filepath"
"strings"
)
// GetRuntimeDir returns XDG_RUNTIME_DIR.
// XDG_RUNTIME_DIR is typically configured via pam_systemd.
// GetRuntimeDir returns non-nil error if XDG_RUNTIME_DIR is not set.
//
// See also https://standards.freedesktop.org/basedir-spec/latest/ar01s03.html
func GetRuntimeDir() (string, error) {
if xdgRuntimeDir := os.Getenv("XDG_RUNTIME_DIR"); xdgRuntimeDir != "" {
return xdgRuntimeDir, nil
}
return "", errors.New("could not get XDG_RUNTIME_DIR")
}
// StickRuntimeDirContents sets the sticky bit on files that are under
// XDG_RUNTIME_DIR, so that the files won't be periodically removed by the system.
//
// StickyRuntimeDir returns slice of sticked files.
// StickyRuntimeDir returns nil error if XDG_RUNTIME_DIR is not set.
//
// See also https://standards.freedesktop.org/basedir-spec/latest/ar01s03.html
func StickRuntimeDirContents(files []string) ([]string, error) {
runtimeDir, err := GetRuntimeDir()
if err != nil {
// ignore error if runtimeDir is empty
return nil, nil
}
runtimeDir, err = filepath.Abs(runtimeDir)
if err != nil {
return nil, err
}
var sticked []string
for _, f := range files {
f, err = filepath.Abs(f)
if err != nil {
return sticked, err
}
if strings.HasPrefix(f, runtimeDir+"/") {
if err = stick(f); err != nil {
return sticked, err
}
sticked = append(sticked, f)
}
}
return sticked, nil
}
func stick(f string) error {
st, err := os.Stat(f)
if err != nil {
return err
}
m := st.Mode()
m |= os.ModeSticky
return os.Chmod(f, m)
}
// GetDataHome returns XDG_DATA_HOME.
// GetDataHome returns $HOME/.local/share and nil error if XDG_DATA_HOME is not set.
// If HOME and XDG_DATA_HOME are not set, getpwent(3) is consulted to determine the users home directory.
//
// See also https://standards.freedesktop.org/basedir-spec/latest/ar01s03.html
func GetDataHome() (string, error) {
	if dataHome := os.Getenv("XDG_DATA_HOME"); dataHome != "" {
		return dataHome, nil
	}
	if home := Get(); home != "" {
		return filepath.Join(home, ".local", "share"), nil
	}
	return "", errors.New("could not get either XDG_DATA_HOME or HOME")
}
// GetConfigHome returns XDG_CONFIG_HOME.
// GetConfigHome returns $HOME/.config and nil error if XDG_CONFIG_HOME is not set.
// If HOME and XDG_CONFIG_HOME are not set, getpwent(3) is consulted to determine the users home directory.
//
// See also https://standards.freedesktop.org/basedir-spec/latest/ar01s03.html
func GetConfigHome() (string, error) {
	if configHome := os.Getenv("XDG_CONFIG_HOME"); configHome != "" {
		return configHome, nil
	}
	if home := Get(); home != "" {
		return filepath.Join(home, ".config"), nil
	}
	return "", errors.New("could not get either XDG_CONFIG_HOME or HOME")
}
// GetLibHome returns $HOME/.local/lib
// If HOME is not set, getpwent(3) is consulted to determine the users home directory.
func GetLibHome() (string, error) {
	if home := Get(); home != "" {
		return filepath.Join(home, ".local/lib"), nil
	}
	return "", errors.New("could not get HOME")
}
package ioutils
import (
"context"
"io"
"runtime/debug"
"sync/atomic"
"github.com/containerd/log"
)
// readCloserWrapper wraps an io.Reader, and implements an io.ReadCloser
// It calls the given callback function when closed. It should be constructed
// with NewReadCloserWrapper
type readCloserWrapper struct {
	io.Reader
	// closer is invoked exactly once, on the first Close call.
	closer func() error
	// closed guards against (and logs) duplicate Close calls.
	closed atomic.Bool
}
// Close calls back the passed closer function exactly once; repeated calls
// only log a warning and return nil.
func (r *readCloserWrapper) Close() error {
	if r.closed.CompareAndSwap(false, true) {
		return r.closer()
	}
	subsequentCloseWarn("ReadCloserWrapper")
	return nil
}
// NewReadCloserWrapper wraps an io.Reader, and implements an io.ReadCloser.
// It calls the given callback function when closed.
func NewReadCloserWrapper(r io.Reader, closer func() error) io.ReadCloser {
	wrapper := &readCloserWrapper{}
	wrapper.Reader = r
	wrapper.closer = closer
	return wrapper
}
// cancelReadCloser wraps an io.ReadCloser with a context for cancelling read
// operations.
type cancelReadCloser struct {
	// cancel stops the context-watching goroutine (see NewCancelReadCloser).
	cancel func()
	pR     *io.PipeReader // Stream to read from
	// pW is the write end; closing it (with an error) unblocks readers.
	pW *io.PipeWriter
	// closed guards against (and logs) duplicate Close calls.
	closed atomic.Bool
}
// NewCancelReadCloser creates a wrapper that closes the ReadCloser when the
// context is cancelled. The returned io.ReadCloser must be closed when it is
// no longer needed.
func NewCancelReadCloser(ctx context.Context, in io.ReadCloser) io.ReadCloser {
	pR, pW := io.Pipe()
	// Create a context used to signal when the pipe is closed
	doneCtx, cancel := context.WithCancel(context.Background())
	p := &cancelReadCloser{
		cancel: cancel,
		pR:     pR,
		pW:     pW,
	}
	// Pump the wrapped reader into the pipe; consumers read from the pipe's
	// read end, so closing the pipe is enough to unblock them.
	go func() {
		_, err := io.Copy(pW, in)
		select {
		case <-ctx.Done():
			// If the context was closed, p.closeWithError
			// was already called. Calling it again would
			// change the error that Read returns.
		default:
			p.closeWithError(err)
		}
		in.Close()
	}()
	// Watch for cancellation. closeWithError cancels doneCtx, which in turn
	// terminates this goroutine.
	go func() {
		for {
			select {
			case <-ctx.Done():
				p.closeWithError(ctx.Err())
			case <-doneCtx.Done():
				return
			}
		}
	}()
	return p
}
// Read wraps the Read method of the pipe that provides data from the wrapped
// ReadCloser.
func (p *cancelReadCloser) Read(buf []byte) (int, error) {
	return p.pR.Read(buf)
}

// closeWithError closes the wrapper and its underlying reader. It will
// cause future calls to Read to return err.
func (p *cancelReadCloser) closeWithError(err error) {
	// Closing the write end with err makes pending and future Reads return
	// err; cancel() stops the context-watching goroutine.
	_ = p.pW.CloseWithError(err)
	p.cancel()
}
// Close closes the wrapper its underlying reader. It will cause
// future calls to Read to return io.EOF. Only the first call has an effect;
// repeated calls log a warning.
func (p *cancelReadCloser) Close() error {
	if p.closed.CompareAndSwap(false, true) {
		p.closeWithError(io.EOF)
		return nil
	}
	subsequentCloseWarn("cancelReadCloser")
	return nil
}
// subsequentCloseWarn logs a duplicate-Close attempt on the named wrapper,
// including a stack trace when debug logging is enabled.
func subsequentCloseWarn(name string) {
	logger := log.G(context.TODO())
	logger.Error("subsequent attempt to close " + name)
	if log.GetLevel() >= log.DebugLevel {
		logger.Errorf("stack trace: %s", debug.Stack())
	}
}
package ioutils
import (
"io"
"sync"
)
// WriteFlusher wraps the Write and Flush operation ensuring that every write
// is a flush. In addition, the Close method can be called to intercept
// Read/Write calls if the targets lifecycle has already ended.
type WriteFlusher struct {
	w       io.Writer
	flusher flusher
	// flushed is closed (once) on the first Flush; see Flushed.
	flushed     chan struct{}
	flushedOnce sync.Once
	// closed is closed by Close; Write/Flush become no-ops afterwards.
	closed    chan struct{}
	closeLock sync.Mutex
}

// flusher is the minimal flushing capability required of the wrapped writer
// (satisfied by http.Flusher, among others).
type flusher interface {
	Flush()
}
// Write writes b to the underlying writer and immediately flushes. After
// Close, writes are rejected with io.EOF.
func (wf *WriteFlusher) Write(b []byte) (int, error) {
	// Non-blocking closed check.
	// NOTE(review): there is a window between this check and the write in
	// which Close may run; this appears accepted by design (cf. the BUG
	// note on Flushed) — confirm before relying on strict ordering.
	select {
	case <-wf.closed:
		return 0, io.EOF
	default:
	}
	n, err := wf.w.Write(b)
	wf.Flush() // every write is a flush.
	return n, err
}
// Flush the stream immediately.
func (wf *WriteFlusher) Flush() {
	// No-op once closed.
	select {
	case <-wf.closed:
		return
	default:
	}
	// Record that at least one flush happened (observed via Flushed).
	wf.flushedOnce.Do(func() {
		close(wf.flushed)
	})
	wf.flusher.Flush()
}
// Flushed returns the state of flushed.
// If it's flushed, return true, or else it return false.
func (wf *WriteFlusher) Flushed() bool {
	// BUG(stevvooe): Remove this method. Its use is inherently racy. Seems to
	// be used to detect whether or a response code has been issued or not.
	// Another hook should be used instead.
	select {
	case <-wf.flushed:
		return true
	default:
		return false
	}
}
// Close closes the write flusher, disallowing any further writes to the
// target. After the flusher is closed, all calls to write or flush will
// result in an error.
func (wf *WriteFlusher) Close() error {
	// closeLock serializes Close calls so the channel is closed only once;
	// a second Close observes the closed channel and returns io.EOF.
	wf.closeLock.Lock()
	defer wf.closeLock.Unlock()
	select {
	case <-wf.closed:
		return io.EOF
	default:
		close(wf.closed)
	}
	return nil
}
// nopFlusher represents a type which flush operation is nop. It is used when
// the wrapped writer does not itself implement the flusher interface.
type nopFlusher struct{}

// Flush is a nop operation.
func (f *nopFlusher) Flush() {}
// NewWriteFlusher returns a new WriteFlusher.
func NewWriteFlusher(w io.Writer) *WriteFlusher {
	fl, ok := w.(flusher)
	if !ok {
		// The writer cannot flush; fall back to a no-op flusher.
		fl = &nopFlusher{}
	}
	return &WriteFlusher{
		w:       w,
		flusher: fl,
		closed:  make(chan struct{}),
		flushed: make(chan struct{}),
	}
}
package ioutils
import (
"io"
"sync/atomic"
)
// writeCloserWrapper turns an io.Writer into an io.WriteCloser by invoking
// the given closer callback on the first Close call.
type writeCloserWrapper struct {
	io.Writer
	// closer is invoked exactly once, on the first Close call.
	closer func() error
	// closed guards against (and logs) duplicate Close calls.
	closed atomic.Bool
}
// Close invokes the wrapped closer exactly once; repeated calls only log a
// warning and return nil.
func (r *writeCloserWrapper) Close() error {
	if r.closed.CompareAndSwap(false, true) {
		return r.closer()
	}
	subsequentCloseWarn("WriteCloserWrapper")
	return nil
}
// NewWriteCloserWrapper returns a new io.WriteCloser.
func NewWriteCloserWrapper(r io.Writer, closer func() error) io.WriteCloser {
	wrapper := &writeCloserWrapper{}
	wrapper.Writer = r
	wrapper.closer = closer
	return wrapper
}
package jsonmessage
import (
"encoding/json"
"fmt"
"io"
"strings"
"time"
"github.com/docker/go-units"
"github.com/moby/term"
"github.com/morikuni/aec"
)
// RFC3339NanoFixed is time.RFC3339Nano with nanoseconds padded using zeros to
// ensure the formatted time is always the same number of characters.
const RFC3339NanoFixed = "2006-01-02T15:04:05.000000000Z07:00"
// JSONError wraps a concrete Code and Message, Code is
// an integer error code, Message is the error message.
type JSONError struct {
	Code    int    `json:"code,omitempty"`
	Message string `json:"message,omitempty"`
}

// Error implements the error interface, returning only the message text.
func (e *JSONError) Error() string {
	return e.Message
}
// JSONProgress describes a progress message in a JSON stream.
type JSONProgress struct {
	// Current is the current status and value of the progress made towards Total.
	Current int64 `json:"current,omitempty"`
	// Total is the end value describing when we made 100% progress for an operation.
	Total int64 `json:"total,omitempty"`
	// Start is the initial value for the operation.
	// (Interpreted by String as a Unix timestamp for the time-left estimate.)
	Start int64 `json:"start,omitempty"`
	// HideCounts. if true, hides the progress count indicator (xB/yB).
	HideCounts bool `json:"hidecounts,omitempty"`
	// Units is the unit to print for progress. It defaults to "bytes" if empty.
	Units string `json:"units,omitempty"`
	// terminalFd is the fd of the current terminal, if any. It is used
	// to get the terminal width.
	terminalFd uintptr
	// nowFunc is used to override the current time in tests.
	nowFunc func() time.Time
	// winSize is used to override the terminal width in tests.
	winSize int
}
// String renders the progress as "[====>   ] current/total time-left".
// The bar is only drawn on terminals wider than 110 columns, and the
// time-left estimate only on terminals wider than 50 columns.
func (p *JSONProgress) String() string {
	var (
		width      = p.width()
		pbBox      string
		numbersBox string
	)
	if p.Current <= 0 && p.Total <= 0 {
		return ""
	}
	// Without a known total, only the raw current value can be shown.
	if p.Total <= 0 {
		switch p.Units {
		case "":
			return fmt.Sprintf("%8v", units.HumanSize(float64(p.Current)))
		default:
			return fmt.Sprintf("%d %s", p.Current, p.Units)
		}
	}
	// The bar is 50 characters wide, so percentage is halved (0-50) and
	// clamped; it doubles as the number of "=" characters to draw.
	percentage := int(float64(p.Current)/float64(p.Total)*100) / 2
	if percentage > 50 {
		percentage = 50
	}
	if width > 110 {
		// this number can't be negative gh#7136
		numSpaces := 0
		if 50-percentage > 0 {
			numSpaces = 50 - percentage
		}
		pbBox = fmt.Sprintf("[%s>%s] ", strings.Repeat("=", percentage), strings.Repeat(" ", numSpaces))
	}
	switch {
	case p.HideCounts:
	case p.Units == "": // no units, use bytes
		current := units.HumanSize(float64(p.Current))
		total := units.HumanSize(float64(p.Total))
		numbersBox = fmt.Sprintf("%8v/%v", current, total)
		if p.Current > p.Total {
			// remove total display if the reported current is wonky.
			numbersBox = fmt.Sprintf("%8v", current)
		}
	default:
		numbersBox = fmt.Sprintf("%d/%d %s", p.Current, p.Total, p.Units)
		if p.Current > p.Total {
			// remove total display if the reported current is wonky.
			numbersBox = fmt.Sprintf("%d %s", p.Current, p.Units)
		}
	}
	// Show approximation of remaining time if there's enough width.
	var timeLeftBox string
	if width > 50 {
		if p.Current > 0 && p.Start > 0 && percentage < 50 {
			// Linear extrapolation from throughput since Start.
			fromStart := p.now().Sub(time.Unix(p.Start, 0))
			perEntry := fromStart / time.Duration(p.Current)
			left := time.Duration(p.Total-p.Current) * perEntry
			timeLeftBox = " " + left.Round(time.Second).String()
		}
	}
	return pbBox + numbersBox + timeLeftBox
}
// now returns the current time in UTC, but can be overridden in tests
// by setting JSONProgress.nowFunc to a custom function.
func (p *JSONProgress) now() time.Time {
	if p.nowFunc == nil {
		return time.Now().UTC()
	}
	return p.nowFunc()
}
// width returns the current terminal's width, but can be overridden
// in tests by setting JSONProgress.winSize to a non-zero value.
func (p *JSONProgress) width() int {
	if p.winSize != 0 {
		return p.winSize
	}
	if ws, err := term.GetWinsize(p.terminalFd); err == nil {
		return int(ws.Width)
	}
	// Not a terminal (or size unavailable): assume a generous width.
	return 200
}
// JSONMessage defines a message struct. It describes
// the created time, where it from, status, ID of the
// message. It's used for docker events.
type JSONMessage struct {
	Stream   string        `json:"stream,omitempty"`
	Status   string        `json:"status,omitempty"`
	Progress *JSONProgress `json:"progressDetail,omitempty"`
	// ProgressMessage is a pre-formatted presentation of [Progress].
	//
	// Deprecated: this field is deprecated since docker v0.7.1 / API v1.8. Use the information in [Progress] instead. This field will be omitted in a future release.
	ProgressMessage string `json:"progress,omitempty"`
	ID              string `json:"id,omitempty"`
	From            string `json:"from,omitempty"`
	// Time and TimeNano are Unix timestamps; TimeNano takes precedence when
	// both are set (see Display).
	Time     int64      `json:"time,omitempty"`
	TimeNano int64      `json:"timeNano,omitempty"`
	Error    *JSONError `json:"errorDetail,omitempty"`
	// ErrorMessage contains errors encountered during the operation.
	//
	// Deprecated: this field is deprecated since docker v0.6.0 / API v1.4. Use [Error.Message] instead. This field will be omitted in a future release.
	ErrorMessage string `json:"error,omitempty"` // deprecated
	// Aux contains out-of-band data, such as digests for push signing and image id after building.
	Aux *json.RawMessage `json:"aux,omitempty"`
}
// clearLine erases the entire current terminal line using an ANSI escape
// sequence.
func clearLine(out io.Writer) {
	eraseMode := aec.EraseModes.All
	cl := aec.EraseLine(eraseMode)
	fmt.Fprint(out, cl)
}

// cursorUp moves the terminal cursor up by l lines.
func cursorUp(out io.Writer, l uint) {
	fmt.Fprint(out, aec.Up(l))
}

// cursorDown moves the terminal cursor down by l lines.
func cursorDown(out io.Writer, l uint) {
	fmt.Fprint(out, aec.Down(l))
}
// Display prints the JSONMessage to out. If isTerminal is true, it erases
// the entire current line when displaying the progressbar. It returns an
// error if the [JSONMessage.Error] field is non-nil.
func (jm *JSONMessage) Display(out io.Writer, isTerminal bool) error {
	if jm.Error != nil {
		return jm.Error
	}
	var endl string
	if isTerminal && jm.Stream == "" && jm.Progress != nil {
		// Redraw the progress bar in place: erase the line and end with a
		// carriage return instead of a newline.
		clearLine(out)
		endl = "\r"
		fmt.Fprint(out, endl)
	} else if jm.Progress != nil && jm.Progress.String() != "" { // disable progressbar in non-terminal
		return nil
	}
	// Optional timestamp prefix; nanosecond precision wins when both set.
	if jm.TimeNano != 0 {
		fmt.Fprintf(out, "%s ", time.Unix(0, jm.TimeNano).Format(RFC3339NanoFixed))
	} else if jm.Time != 0 {
		fmt.Fprintf(out, "%s ", time.Unix(jm.Time, 0).Format(RFC3339NanoFixed))
	}
	if jm.ID != "" {
		fmt.Fprintf(out, "%s: ", jm.ID)
	}
	if jm.From != "" {
		fmt.Fprintf(out, "(from %s) ", jm.From)
	}
	if jm.Progress != nil && isTerminal {
		fmt.Fprintf(out, "%s %s%s", jm.Status, jm.Progress.String(), endl)
	} else if jm.ProgressMessage != "" { // deprecated
		fmt.Fprintf(out, "%s %s%s", jm.Status, jm.ProgressMessage, endl)
	} else if jm.Stream != "" {
		fmt.Fprintf(out, "%s%s", jm.Stream, endl)
	} else {
		fmt.Fprintf(out, "%s%s\n", jm.Status, endl)
	}
	return nil
}
// DisplayJSONMessagesStream reads a JSON message stream from in, and writes
// each [JSONMessage] to out. It returns an error if an invalid JSONMessage
// is received, or if a JSONMessage contains a non-zero [JSONMessage.Error].
//
// Presentation of the JSONMessage depends on whether a terminal is attached,
// and on the terminal width. Progress bars ([JSONProgress]) are suppressed
// on narrower terminals (< 110 characters).
//
// - isTerminal describes if out is a terminal, in which case it prints
// a newline ("\n") at the end of each line and moves the cursor while
// displaying.
// - terminalFd is the fd of the current terminal (if any), and used
// to get the terminal width.
// - auxCallback allows handling the [JSONMessage.Aux] field. It is
// called if a JSONMessage contains an Aux field, in which case
// DisplayJSONMessagesStream does not present the JSONMessage.
func DisplayJSONMessagesStream(in io.Reader, out io.Writer, terminalFd uintptr, isTerminal bool, auxCallback func(JSONMessage)) error {
	var (
		dec = json.NewDecoder(in)
		// ids maps a message ID to the (0-based) output line that shows
		// its progress, so later updates can redraw that line in place.
		ids = make(map[string]uint)
	)
	for {
		// diff is how many lines up the ID's line is from the cursor.
		var diff uint
		var jm JSONMessage
		if err := dec.Decode(&jm); err != nil {
			if err == io.EOF {
				break
			}
			return err
		}
		if jm.Aux != nil {
			// Out-of-band data: hand to the callback (if any) and skip
			// normal presentation.
			if auxCallback != nil {
				auxCallback(jm)
			}
			continue
		}
		if jm.Progress != nil {
			// Give the progress struct access to the terminal so it can
			// size its bar.
			jm.Progress.terminalFd = terminalFd
		}
		if jm.ID != "" && (jm.Progress != nil || jm.ProgressMessage != "") {
			line, ok := ids[jm.ID]
			if !ok {
				// NOTE: This approach of using len(id) to
				// figure out the number of lines of history
				// only works as long as we clear the history
				// when we output something that's not
				// accounted for in the map, such as a line
				// with no ID.
				line = uint(len(ids))
				ids[jm.ID] = line
				if isTerminal {
					// Reserve a fresh line for this new ID.
					fmt.Fprintf(out, "\n")
				}
			}
			diff = uint(len(ids)) - line
			if isTerminal {
				// Jump up to the ID's line so Display overwrites it.
				cursorUp(out, diff)
			}
		} else {
			// When outputting something that isn't progress
			// output, clear the history of previous lines. We
			// don't want progress entries from some previous
			// operation to be updated (for example, pull -a
			// with multiple tags).
			ids = make(map[string]uint)
		}
		err := jm.Display(out, isTerminal)
		if jm.ID != "" && isTerminal {
			// Return the cursor to the bottom of the progress area.
			cursorDown(out, diff)
		}
		if err != nil {
			return err
		}
	}
	return nil
}
// Stream is an io.Writer for output with utilities to get the output's file
// descriptor and to detect whether it's a terminal.
//
// It is a subset of the streams.Out type in
// https://pkg.go.dev/github.com/docker/cli@v20.10.17+incompatible/cli/streams#Out
type Stream interface {
	io.Writer
	// FD returns the file descriptor of the stream, used to query the
	// terminal (e.g. for its width).
	FD() uintptr
	// IsTerminal reports whether the stream is attached to a terminal.
	IsTerminal() bool
}
// DisplayJSONMessagesToStream prints json messages to the output Stream. It is
// used by the Docker CLI to print JSONMessage streams.
func DisplayJSONMessagesToStream(in io.Reader, stream Stream, auxCallback func(JSONMessage)) error {
	// Derive the terminal parameters from the stream itself, then defer
	// to the lower-level display function.
	fd := stream.FD()
	isTerminal := stream.IsTerminal()
	return DisplayJSONMessagesStream(in, stream, fd, isTerminal, auxCallback)
}
// Copyright 2022 Google LLC. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package jsonmessage
import (
"bytes"
)
// FuzzDisplayJSONMessagesStream feeds arbitrary fuzzer-supplied bytes
// into DisplayJSONMessagesStream, exercising the JSON decode and display
// paths. The constant return value of 1 tells the fuzzer the input was
// processed.
func FuzzDisplayJSONMessagesStream(data []byte) int {
	var out bytes.Buffer
	in := bytes.NewReader(data)
	// The error is intentionally ignored: invalid input is expected
	// during fuzzing; only crashes/panics are of interest.
	_ = DisplayJSONMessagesStream(in, &out, 0, false, nil)
	return 1
}
// Package longpath introduces some constants and helper functions for handling
// long paths in Windows.
//
// Long paths are expected to be prepended with "\\?\" and followed by either a
// drive letter, a UNC server\share, or a volume identifier.
package longpath
import (
"os"
"runtime"
"strings"
)
// longPathPrefix is the longpath prefix for Windows file paths.
const longPathPrefix = `\\?\`

// AddPrefix adds the Windows long path prefix to the path provided if
// it does not already have it.
func AddPrefix(path string) string {
	switch {
	case strings.HasPrefix(path, longPathPrefix):
		// Already in long-path form; return unchanged.
		return path
	case strings.HasPrefix(path, `\\`):
		// UNC path (\\server\share): insert "UNC" after the prefix so the
		// result is \\?\UNC\server\share.
		return longPathPrefix + `UNC` + path[1:]
	default:
		return longPathPrefix + path
	}
}
// MkdirTemp is the equivalent of [os.MkdirTemp], except that on Windows
// the result is in Windows longpath format. On Unix systems it is
// equivalent to [os.MkdirTemp].
func MkdirTemp(dir, prefix string) (string, error) {
	tempDir, err := os.MkdirTemp(dir, prefix)
	if err != nil {
		return "", err
	}
	// Only Windows needs the long-path prefix applied to the result.
	if runtime.GOOS == "windows" {
		return AddPrefix(tempDir), nil
	}
	return tempDir, nil
}
// Package namesgenerator generates random names.
//
// This package is officially "frozen" - no new additions will be accepted.
//
// For a long time, this package provided a lot of joy within the project, but
// at some point the conflicts of opinion became greater than the added joy.
//
// At some future time, this may be replaced with something that sparks less
// controversy, but for now it will remain as-is.
//
// See also https://github.com/moby/moby/pull/43210#issuecomment-1029934277
package namesgenerator
import (
"math/rand"
"strconv"
)
var (
left = [...]string{
"admiring",
"adoring",
"affectionate",
"agitated",
"amazing",
"angry",
"awesome",
"beautiful",
"blissful",
"bold",
"boring",
"brave",
"busy",
"charming",
"clever",
"compassionate",
"competent",
"condescending",
"confident",
"cool",
"cranky",
"crazy",
"dazzling",
"determined",
"distracted",
"dreamy",
"eager",
"ecstatic",
"elastic",
"elated",
"elegant",
"eloquent",
"epic",
"exciting",
"fervent",
"festive",
"flamboyant",
"focused",
"friendly",
"frosty",
"funny",
"gallant",
"gifted",
"goofy",
"gracious",
"great",
"happy",
"hardcore",
"heuristic",
"hopeful",
"hungry",
"infallible",
"inspiring",
"intelligent",
"interesting",
"jolly",
"jovial",
"keen",
"kind",
"laughing",
"loving",
"lucid",
"magical",
"modest",
"musing",
"mystifying",
"naughty",
"nervous",
"nice",
"nifty",
"nostalgic",
"objective",
"optimistic",
"peaceful",
"pedantic",
"pensive",
"practical",
"priceless",
"quirky",
"quizzical",
"recursing",
"relaxed",
"reverent",
"romantic",
"sad",
"serene",
"sharp",
"silly",
"sleepy",
"stoic",
"strange",
"stupefied",
"suspicious",
"sweet",
"tender",
"thirsty",
"trusting",
"unruffled",
"upbeat",
"vibrant",
"vigilant",
"vigorous",
"wizardly",
"wonderful",
"xenodochial",
"youthful",
"zealous",
"zen",
}
// Docker, starting from 0.7.x, generates names from notable scientists and hackers.
// Please, for any amazing man that you add to the list, consider adding an equally amazing woman to it, and vice versa.
right = [...]string{
// Maria Gaetana Agnesi - Italian mathematician, philosopher, theologian and humanitarian. She was the first woman to write a mathematics handbook and the first woman appointed as a Mathematics Professor at a University. https://en.wikipedia.org/wiki/Maria_Gaetana_Agnesi
"agnesi",
// Muhammad ibn Jābir al-Ḥarrānī al-Battānī was a founding father of astronomy. https://en.wikipedia.org/wiki/Mu%E1%B8%A5ammad_ibn_J%C4%81bir_al-%E1%B8%A4arr%C4%81n%C4%AB_al-Batt%C4%81n%C4%AB
"albattani",
// Frances E. Allen, became the first female IBM Fellow in 1989. In 2006, she became the first female recipient of the ACM's Turing Award. https://en.wikipedia.org/wiki/Frances_E._Allen
"allen",
// June Almeida - Scottish virologist who took the first pictures of the rubella virus - https://en.wikipedia.org/wiki/June_Almeida
"almeida",
// Kathleen Antonelli, American computer programmer and one of the six original programmers of the ENIAC - https://en.wikipedia.org/wiki/Kathleen_Antonelli
"antonelli",
// Archimedes was a physicist, engineer and mathematician who invented too many things to list them here. https://en.wikipedia.org/wiki/Archimedes
"archimedes",
// Maria Ardinghelli - Italian translator, mathematician and physicist - https://en.wikipedia.org/wiki/Maria_Ardinghelli
"ardinghelli",
// Aryabhata - Ancient Indian mathematician-astronomer during 476-550 CE https://en.wikipedia.org/wiki/Aryabhata
"aryabhata",
// Wanda Austin - Wanda Austin is the President and CEO of The Aerospace Corporation, a leading architect for the US security space programs. https://en.wikipedia.org/wiki/Wanda_Austin
"austin",
// Charles Babbage invented the concept of a programmable computer. https://en.wikipedia.org/wiki/Charles_Babbage.
"babbage",
// Stefan Banach - Polish mathematician, was one of the founders of modern functional analysis. https://en.wikipedia.org/wiki/Stefan_Banach
"banach",
// Buckaroo Banzai and his mentor Dr. Hikita perfected the "oscillation overthruster", a device that allows one to pass through solid matter. - https://en.wikipedia.org/wiki/The_Adventures_of_Buckaroo_Banzai_Across_the_8th_Dimension
"banzai",
// John Bardeen co-invented the transistor - https://en.wikipedia.org/wiki/John_Bardeen
"bardeen",
// Jean Bartik, born Betty Jean Jennings, was one of the original programmers for the ENIAC computer. https://en.wikipedia.org/wiki/Jean_Bartik
"bartik",
// Laura Bassi, the world's first female professor https://en.wikipedia.org/wiki/Laura_Bassi
"bassi",
// Hugh Beaver, British engineer, founder of the Guinness Book of World Records https://en.wikipedia.org/wiki/Hugh_Beaver
"beaver",
// Alexander Graham Bell - an eminent Scottish-born scientist, inventor, engineer and innovator who is credited with inventing the first practical telephone - https://en.wikipedia.org/wiki/Alexander_Graham_Bell
"bell",
// Karl Friedrich Benz - a German automobile engineer. Inventor of the first practical motorcar. https://en.wikipedia.org/wiki/Karl_Benz
"benz",
// Homi J Bhabha - was an Indian nuclear physicist, founding director, and professor of physics at the Tata Institute of Fundamental Research. Colloquially known as "father of Indian nuclear programme"- https://en.wikipedia.org/wiki/Homi_J._Bhabha
"bhabha",
// Bhaskara II - Ancient Indian mathematician-astronomer whose work on calculus predates Newton and Leibniz by over half a millennium - https://en.wikipedia.org/wiki/Bh%C4%81skara_II#Calculus
"bhaskara",
// Sue Black - British computer scientist and campaigner. She has been instrumental in saving Bletchley Park, the site of World War II codebreaking - https://en.wikipedia.org/wiki/Sue_Black_(computer_scientist)
"black",
// Elizabeth Helen Blackburn - Australian-American Nobel laureate; best known for co-discovering telomerase. https://en.wikipedia.org/wiki/Elizabeth_Blackburn
"blackburn",
// Elizabeth Blackwell - American doctor and first American woman to receive a medical degree - https://en.wikipedia.org/wiki/Elizabeth_Blackwell
"blackwell",
// Niels Bohr is the father of quantum theory. https://en.wikipedia.org/wiki/Niels_Bohr.
"bohr",
// Kathleen Booth, she's credited with writing the first assembly language. https://en.wikipedia.org/wiki/Kathleen_Booth
"booth",
// Anita Borg - Anita Borg was the founding director of the Institute for Women and Technology (IWT). https://en.wikipedia.org/wiki/Anita_Borg
"borg",
// Satyendra Nath Bose - He provided the foundation for Bose–Einstein statistics and the theory of the Bose–Einstein condensate. - https://en.wikipedia.org/wiki/Satyendra_Nath_Bose
"bose",
// Katherine Louise Bouman is an imaging scientist and Assistant Professor of Computer Science at the California Institute of Technology. She researches computational methods for imaging, and developed an algorithm that made possible the picture first visualization of a black hole using the Event Horizon Telescope. - https://en.wikipedia.org/wiki/Katie_Bouman
"bouman",
// Evelyn Boyd Granville - She was one of the first African-American woman to receive a Ph.D. in mathematics; she earned it in 1949 from Yale University. https://en.wikipedia.org/wiki/Evelyn_Boyd_Granville
"boyd",
// Brahmagupta - Ancient Indian mathematician during 598-670 CE who gave rules to compute with zero - https://en.wikipedia.org/wiki/Brahmagupta#Zero
"brahmagupta",
// Walter Houser Brattain co-invented the transistor - https://en.wikipedia.org/wiki/Walter_Houser_Brattain
"brattain",
// Emmett Brown invented time travel. https://en.wikipedia.org/wiki/Emmett_Brown (thanks Brian Goff)
"brown",
// Linda Brown Buck - American biologist and Nobel laureate best known for her genetic and molecular analyses of the mechanisms of smell. https://en.wikipedia.org/wiki/Linda_B._Buck
"buck",
// Dame Susan Jocelyn Bell Burnell - Northern Irish astrophysicist who discovered radio pulsars and was the first to analyse them. https://en.wikipedia.org/wiki/Jocelyn_Bell_Burnell
"burnell",
// Annie Jump Cannon - pioneering female astronomer who classified hundreds of thousands of stars and created the system we use to understand stars today. https://en.wikipedia.org/wiki/Annie_Jump_Cannon
"cannon",
// Rachel Carson - American marine biologist and conservationist, her book Silent Spring and other writings are credited with advancing the global environmental movement. https://en.wikipedia.org/wiki/Rachel_Carson
"carson",
// Dame Mary Lucy Cartwright - British mathematician who was one of the first to study what is now known as chaos theory. Also known for Cartwright's theorem which finds applications in signal processing. https://en.wikipedia.org/wiki/Mary_Cartwright
"cartwright",
// George Washington Carver - American agricultural scientist and inventor. He was the most prominent black scientist of the early 20th century. https://en.wikipedia.org/wiki/George_Washington_Carver
"carver",
// Vinton Gray Cerf - American Internet pioneer, recognised as one of "the fathers of the Internet". With Robert Elliot Kahn, he designed TCP and IP, the primary data communication protocols of the Internet and other computer networks. https://en.wikipedia.org/wiki/Vint_Cerf
"cerf",
// Subrahmanyan Chandrasekhar - Astrophysicist known for his mathematical theory on different stages and evolution in structures of the stars. He has won nobel prize for physics - https://en.wikipedia.org/wiki/Subrahmanyan_Chandrasekhar
"chandrasekhar",
// Sergey Alexeyevich Chaplygin (Russian: Серге́й Алексе́евич Чаплы́гин; April 5, 1869 – October 8, 1942) was a Russian and Soviet physicist, mathematician, and mechanical engineer. He is known for mathematical formulas such as Chaplygin's equation and for a hypothetical substance in cosmology called Chaplygin gas, named after him. https://en.wikipedia.org/wiki/Sergey_Chaplygin
"chaplygin",
// Émilie du Châtelet - French natural philosopher, mathematician, physicist, and author during the early 1730s, known for her translation of and commentary on Isaac Newton's book Principia containing basic laws of physics. https://en.wikipedia.org/wiki/%C3%89milie_du_Ch%C3%A2telet
"chatelet",
// Asima Chatterjee was an Indian organic chemist noted for her research on vinca alkaloids, development of drugs for treatment of epilepsy and malaria - https://en.wikipedia.org/wiki/Asima_Chatterjee
"chatterjee",
// David Lee Chaum - American computer scientist and cryptographer. Known for his seminal contributions in the field of anonymous communication. https://en.wikipedia.org/wiki/David_Chaum
"chaum",
// Pafnuty Chebyshev - Russian mathematician. He is known fo his works on probability, statistics, mechanics, analytical geometry and number theory https://en.wikipedia.org/wiki/Pafnuty_Chebyshev
"chebyshev",
// Joan Clarke - Bletchley Park code breaker during the Second World War who pioneered techniques that remained top secret for decades. Also an accomplished numismatist https://en.wikipedia.org/wiki/Joan_Clarke
"clarke",
// Bram Cohen - American computer programmer and author of the BitTorrent peer-to-peer protocol. https://en.wikipedia.org/wiki/Bram_Cohen
"cohen",
// Jane Colden - American botanist widely considered the first female American botanist - https://en.wikipedia.org/wiki/Jane_Colden
"colden",
// Gerty Theresa Cori - American biochemist who became the third woman—and first American woman—to win a Nobel Prize in science, and the first woman to be awarded the Nobel Prize in Physiology or Medicine. Cori was born in Prague. https://en.wikipedia.org/wiki/Gerty_Cori
"cori",
// Seymour Roger Cray was an American electrical engineer and supercomputer architect who designed a series of computers that were the fastest in the world for decades. https://en.wikipedia.org/wiki/Seymour_Cray
"cray",
// Marie Curie discovered radioactivity. https://en.wikipedia.org/wiki/Marie_Curie.
"curie",
// This entry reflects a husband and wife team who worked together:
// Joan Curran was a Welsh scientist who developed radar and invented chaff, a radar countermeasure. https://en.wikipedia.org/wiki/Joan_Curran
// Samuel Curran was an Irish physicist who worked alongside his wife during WWII and invented the proximity fuse. https://en.wikipedia.org/wiki/Samuel_Curran
"curran",
// Charles Darwin established the principles of natural evolution. https://en.wikipedia.org/wiki/Charles_Darwin.
"darwin",
// Leonardo Da Vinci invented too many things to list here. https://en.wikipedia.org/wiki/Leonardo_da_Vinci.
"davinci",
// A. K. (Alexander Keewatin) Dewdney, Canadian mathematician, computer scientist, author and filmmaker. Contributor to Scientific American's "Computer Recreations" from 1984 to 1991. Author of Core War (program), The Planiverse, The Armchair Universe, The Magic Machine, The New Turing Omnibus, and more. https://en.wikipedia.org/wiki/Alexander_Dewdney
"dewdney",
// Satish Dhawan - Indian mathematician and aerospace engineer, known for leading the successful and indigenous development of the Indian space programme. https://en.wikipedia.org/wiki/Satish_Dhawan
"dhawan",
// Bailey Whitfield Diffie - American cryptographer and one of the pioneers of public-key cryptography. https://en.wikipedia.org/wiki/Whitfield_Diffie
"diffie",
// Edsger Wybe Dijkstra was a Dutch computer scientist and mathematical scientist. https://en.wikipedia.org/wiki/Edsger_W._Dijkstra.
"dijkstra",
// Paul Adrien Maurice Dirac - English theoretical physicist who made fundamental contributions to the early development of both quantum mechanics and quantum electrodynamics. https://en.wikipedia.org/wiki/Paul_Dirac
"dirac",
// Agnes Meyer Driscoll - American cryptanalyst during World Wars I and II who successfully cryptanalysed a number of Japanese ciphers. She was also the co-developer of one of the cipher machines of the US Navy, the CM. https://en.wikipedia.org/wiki/Agnes_Meyer_Driscoll
"driscoll",
// Donna Dubinsky - played an integral role in the development of personal digital assistants (PDAs) serving as CEO of Palm, Inc. and co-founding Handspring. https://en.wikipedia.org/wiki/Donna_Dubinsky
"dubinsky",
// Annie Easley - She was a leading member of the team which developed software for the Centaur rocket stage and one of the first African-Americans in her field. https://en.wikipedia.org/wiki/Annie_Easley
"easley",
// Thomas Alva Edison, prolific inventor https://en.wikipedia.org/wiki/Thomas_Edison
"edison",
// Albert Einstein invented the general theory of relativity. https://en.wikipedia.org/wiki/Albert_Einstein
"einstein",
// Alexandra Asanovna Elbakyan (Russian: Алекса́ндра Аса́новна Элбакя́н) is a Kazakhstani graduate student, computer programmer, internet pirate in hiding, and the creator of the site Sci-Hub. Nature has listed her in 2016 in the top ten people that mattered in science, and Ars Technica has compared her to Aaron Swartz. - https://en.wikipedia.org/wiki/Alexandra_Elbakyan
"elbakyan",
// Taher A. ElGamal - Egyptian cryptographer best known for the ElGamal discrete log cryptosystem and the ElGamal digital signature scheme. https://en.wikipedia.org/wiki/Taher_Elgamal
"elgamal",
// Gertrude Elion - American biochemist, pharmacologist and the 1988 recipient of the Nobel Prize in Medicine - https://en.wikipedia.org/wiki/Gertrude_Elion
"elion",
// James Henry Ellis - British engineer and cryptographer employed by the GCHQ. Best known for conceiving for the first time, the idea of public-key cryptography. https://en.wikipedia.org/wiki/James_H._Ellis
"ellis",
// Douglas Engelbart gave the mother of all demos: https://en.wikipedia.org/wiki/Douglas_Engelbart
"engelbart",
// Euclid invented geometry. https://en.wikipedia.org/wiki/Euclid
"euclid",
// Leonhard Euler invented large parts of modern mathematics. https://de.wikipedia.org/wiki/Leonhard_Euler
"euler",
// Michael Faraday - British scientist who contributed to the study of electromagnetism and electrochemistry. https://en.wikipedia.org/wiki/Michael_Faraday
"faraday",
// Horst Feistel - German-born American cryptographer who was one of the earliest non-government researchers to study the design and theory of block ciphers. Co-developer of DES and Lucifer. Feistel networks, a symmetric structure used in the construction of block ciphers are named after him. https://en.wikipedia.org/wiki/Horst_Feistel
"feistel",
// Pierre de Fermat pioneered several aspects of modern mathematics. https://en.wikipedia.org/wiki/Pierre_de_Fermat
"fermat",
// Enrico Fermi invented the first nuclear reactor. https://en.wikipedia.org/wiki/Enrico_Fermi.
"fermi",
// Richard Feynman was a key contributor to quantum mechanics and particle physics. https://en.wikipedia.org/wiki/Richard_Feynman
"feynman",
// Benjamin Franklin is famous for his experiments in electricity and the invention of the lightning rod.
"franklin",
// Yuri Alekseyevich Gagarin - Soviet pilot and cosmonaut, best known as the first human to journey into outer space. https://en.wikipedia.org/wiki/Yuri_Gagarin
"gagarin",
// Galileo was a founding father of modern astronomy, and faced politics and obscurantism to establish scientific truth. https://en.wikipedia.org/wiki/Galileo_Galilei
"galileo",
// Évariste Galois - French mathematician whose work laid the foundations of Galois theory and group theory, two major branches of abstract algebra, and the subfield of Galois connections, all while still in his late teens. https://en.wikipedia.org/wiki/%C3%89variste_Galois
"galois",
// Kadambini Ganguly - Indian physician, known for being the first South Asian female physician, trained in western medicine, to graduate in South Asia. https://en.wikipedia.org/wiki/Kadambini_Ganguly
"ganguly",
// William Henry "Bill" Gates III is an American business magnate, philanthropist, investor, computer programmer, and inventor. https://en.wikipedia.org/wiki/Bill_Gates
"gates",
// Johann Carl Friedrich Gauss - German mathematician who made significant contributions to many fields, including number theory, algebra, statistics, analysis, differential geometry, geodesy, geophysics, mechanics, electrostatics, magnetic fields, astronomy, matrix theory, and optics. https://en.wikipedia.org/wiki/Carl_Friedrich_Gauss
"gauss",
// Marie-Sophie Germain - French mathematician, physicist and philosopher. Known for her work on elasticity theory, number theory and philosophy. https://en.wikipedia.org/wiki/Sophie_Germain
"germain",
// Adele Goldberg, was one of the designers and developers of the Smalltalk language. https://en.wikipedia.org/wiki/Adele_Goldberg_(computer_scientist)
"goldberg",
// Adele Goldstine, born Adele Katz, wrote the complete technical description for the first electronic digital computer, ENIAC. https://en.wikipedia.org/wiki/Adele_Goldstine
"goldstine",
// Shafi Goldwasser is a computer scientist known for creating theoretical foundations of modern cryptography. Winner of 2012 ACM Turing Award. https://en.wikipedia.org/wiki/Shafi_Goldwasser
"goldwasser",
// James Golick, all around gangster.
"golick",
// Jane Goodall - British primatologist, ethologist, and anthropologist who is considered to be the world's foremost expert on chimpanzees - https://en.wikipedia.org/wiki/Jane_Goodall
"goodall",
// Stephen Jay Gould was an American paleontologist, evolutionary biologist, and historian of science. He is most famous for the theory of punctuated equilibrium - https://en.wikipedia.org/wiki/Stephen_Jay_Gould
"gould",
// Carolyn Widney Greider - American molecular biologist and joint winner of the 2009 Nobel Prize for Physiology or Medicine for the discovery of telomerase. https://en.wikipedia.org/wiki/Carol_W._Greider
"greider",
// Alexander Grothendieck - German-born French mathematician who became a leading figure in the creation of modern algebraic geometry. https://en.wikipedia.org/wiki/Alexander_Grothendieck
"grothendieck",
// Lois Haibt - American computer scientist, part of the team at IBM that developed FORTRAN - https://en.wikipedia.org/wiki/Lois_Haibt
"haibt",
// Margaret Hamilton - Director of the Software Engineering Division of the MIT Instrumentation Laboratory, which developed on-board flight software for the Apollo space program. https://en.wikipedia.org/wiki/Margaret_Hamilton_(scientist)
"hamilton",
// Caroline Harriet Haslett - English electrical engineer, electricity industry administrator and champion of women's rights. Co-author of British Standard 1363 that specifies AC power plugs and sockets used across the United Kingdom (which is widely considered as one of the safest designs). https://en.wikipedia.org/wiki/Caroline_Haslett
"haslett",
// Stephen Hawking pioneered the field of cosmology by combining general relativity and quantum mechanics. https://en.wikipedia.org/wiki/Stephen_Hawking
"hawking",
// Werner Heisenberg was a founding father of quantum mechanics. https://en.wikipedia.org/wiki/Werner_Heisenberg
"heisenberg",
// Martin Edward Hellman - American cryptologist, best known for his invention of public-key cryptography in co-operation with Whitfield Diffie and Ralph Merkle. https://en.wikipedia.org/wiki/Martin_Hellman
"hellman",
// Grete Hermann was a German philosopher noted for her philosophical work on the foundations of quantum mechanics. https://en.wikipedia.org/wiki/Grete_Hermann
"hermann",
// Caroline Lucretia Herschel - German astronomer and discoverer of several comets. https://en.wikipedia.org/wiki/Caroline_Herschel
"herschel",
// Heinrich Rudolf Hertz - German physicist who first conclusively proved the existence of the electromagnetic waves. https://en.wikipedia.org/wiki/Heinrich_Hertz
"hertz",
// Jaroslav Heyrovský was the inventor of the polarographic method, father of the electroanalytical method, and recipient of the Nobel Prize in 1959. His main field of work was polarography. https://en.wikipedia.org/wiki/Jaroslav_Heyrovsk%C3%BD
"heyrovsky",
// Dorothy Hodgkin was a British biochemist, credited with the development of protein crystallography. She was awarded the Nobel Prize in Chemistry in 1964. https://en.wikipedia.org/wiki/Dorothy_Hodgkin
"hodgkin",
// Douglas R. Hofstadter is an American professor of cognitive science and author of the Pulitzer Prize and American Book Award-winning work Goedel, Escher, Bach: An Eternal Golden Braid in 1979. A mind-bending work which coined Hofstadter's Law: "It always takes longer than you expect, even when you take into account Hofstadter's Law." https://en.wikipedia.org/wiki/Douglas_Hofstadter
"hofstadter",
// Erna Schneider Hoover revolutionized modern communication by inventing a computerized telephone switching method. https://en.wikipedia.org/wiki/Erna_Schneider_Hoover
"hoover",
// Grace Hopper developed the first compiler for a computer programming language and is credited with popularizing the term "debugging" for fixing computer glitches. https://en.wikipedia.org/wiki/Grace_Hopper
"hopper",
// Frances Hugle, she was an American scientist, engineer, and inventor who contributed to the understanding of semiconductors, integrated circuitry, and the unique electrical principles of microscopic materials. https://en.wikipedia.org/wiki/Frances_Hugle
"hugle",
// Hypatia - Greek Alexandrine Neoplatonist philosopher in Egypt who was one of the earliest mothers of mathematics - https://en.wikipedia.org/wiki/Hypatia
"hypatia",
// Teruko Ishizaka - Japanese scientist and immunologist who co-discovered the antibody class Immunoglobulin E. https://en.wikipedia.org/wiki/Teruko_Ishizaka
"ishizaka",
// Mary Jackson, American mathematician and aerospace engineer who earned the highest title within NASA's engineering department - https://en.wikipedia.org/wiki/Mary_Jackson_(engineer)
"jackson",
// Yeong-Sil Jang was a Korean scientist and astronomer during the Joseon Dynasty; he invented the first metal printing press and water gauge. https://en.wikipedia.org/wiki/Jang_Yeong-sil
"jang",
// Mae Carol Jemison - is an American engineer, physician, and former NASA astronaut. She became the first black woman to travel in space when she served as a mission specialist aboard the Space Shuttle Endeavour - https://en.wikipedia.org/wiki/Mae_Jemison
"jemison",
// Betty Jennings - one of the original programmers of the ENIAC. https://en.wikipedia.org/wiki/ENIAC - https://en.wikipedia.org/wiki/Jean_Bartik
"jennings",
// Mary Lou Jepsen, was the founder and chief technology officer of One Laptop Per Child (OLPC), and the founder of Pixel Qi. https://en.wikipedia.org/wiki/Mary_Lou_Jepsen
"jepsen",
// Katherine Coleman Goble Johnson - American physicist and mathematician contributed to the NASA. https://en.wikipedia.org/wiki/Katherine_Johnson
"johnson",
// Irène Joliot-Curie - French scientist who was awarded the Nobel Prize for Chemistry in 1935. Daughter of Marie and Pierre Curie. https://en.wikipedia.org/wiki/Ir%C3%A8ne_Joliot-Curie
"joliot",
// Karen Spärck Jones came up with the concept of inverse document frequency, which is used in most search engines today. https://en.wikipedia.org/wiki/Karen_Sp%C3%A4rck_Jones
"jones",
// A. P. J. Abdul Kalam - is an Indian scientist aka Missile Man of India for his work on the development of ballistic missile and launch vehicle technology - https://en.wikipedia.org/wiki/A._P._J._Abdul_Kalam
"kalam",
// Sergey Petrovich Kapitsa (Russian: Серге́й Петро́вич Капи́ца; 14 February 1928 – 14 August 2012) was a Russian physicist and demographer. He was best known as host of the popular and long-running Russian scientific TV show, Evident, but Incredible. His father was the Nobel laureate Soviet-era physicist Pyotr Kapitsa, and his brother was the geographer and Antarctic explorer Andrey Kapitsa. - https://en.wikipedia.org/wiki/Sergey_Kapitsa
"kapitsa",
// Susan Kare, created the icons and many of the interface elements for the original Apple Macintosh in the 1980s, and was an original employee of NeXT, working as the Creative Director. https://en.wikipedia.org/wiki/Susan_Kare
"kare",
// Mstislav Keldysh - a Soviet scientist in the field of mathematics and mechanics, academician of the USSR Academy of Sciences (1946), President of the USSR Academy of Sciences (1961–1975), three times Hero of Socialist Labor (1956, 1961, 1971), fellow of the Royal Society of Edinburgh (1968). https://en.wikipedia.org/wiki/Mstislav_Keldysh
"keldysh",
// Mary Kenneth Keller, Sister Mary Kenneth Keller became the first American woman to earn a PhD in Computer Science in 1965. https://en.wikipedia.org/wiki/Mary_Kenneth_Keller
"keller",
// Johannes Kepler, German astronomer known for his three laws of planetary motion - https://en.wikipedia.org/wiki/Johannes_Kepler
"kepler",
// Omar Khayyam - Persian mathematician, astronomer and poet. Known for his work on the classification and solution of cubic equations, for his contribution to the understanding of Euclid's fifth postulate and for computing the length of a year very accurately. https://en.wikipedia.org/wiki/Omar_Khayyam
"khayyam",
// Har Gobind Khorana - Indian-American biochemist who shared the 1968 Nobel Prize for Physiology - https://en.wikipedia.org/wiki/Har_Gobind_Khorana
"khorana",
// Jack Kilby invented silicon integrated circuits and gave Silicon Valley its name. - https://en.wikipedia.org/wiki/Jack_Kilby
"kilby",
// Maria Kirch - German astronomer and first woman to discover a comet - https://en.wikipedia.org/wiki/Maria_Margarethe_Kirch
"kirch",
// Donald Knuth - American computer scientist, author of "The Art of Computer Programming" and creator of the TeX typesetting system. https://en.wikipedia.org/wiki/Donald_Knuth
"knuth",
// Sophie Kowalevski - Russian mathematician responsible for important original contributions to analysis, differential equations and mechanics - https://en.wikipedia.org/wiki/Sofia_Kovalevskaya
"kowalevski",
// Marie-Jeanne de Lalande - French astronomer, mathematician and cataloguer of stars - https://en.wikipedia.org/wiki/Marie-Jeanne_de_Lalande
"lalande",
// Hedy Lamarr - Actress and inventor. The principles of her work are now incorporated into modern Wi-Fi, CDMA and Bluetooth technology. https://en.wikipedia.org/wiki/Hedy_Lamarr
"lamarr",
// Leslie B. Lamport - American computer scientist. Lamport is best known for his seminal work in distributed systems and was the winner of the 2013 Turing Award. https://en.wikipedia.org/wiki/Leslie_Lamport
"lamport",
// Mary Leakey - British paleoanthropologist who discovered the first fossilized Proconsul skull - https://en.wikipedia.org/wiki/Mary_Leakey
"leakey",
// Henrietta Swan Leavitt - she was an American astronomer who discovered the relation between the luminosity and the period of Cepheid variable stars. https://en.wikipedia.org/wiki/Henrietta_Swan_Leavitt
"leavitt",
// Esther Miriam Zimmer Lederberg - American microbiologist and a pioneer of bacterial genetics. https://en.wikipedia.org/wiki/Esther_Lederberg
"lederberg",
// Inge Lehmann - Danish seismologist and geophysicist. Known for discovering in 1936 that the Earth has a solid inner core inside a molten outer core. https://en.wikipedia.org/wiki/Inge_Lehmann
"lehmann",
// Daniel Lewin - Mathematician, Akamai co-founder, soldier, 9/11 victim-- Developed optimization techniques for routing traffic on the internet. Died attempting to stop the 9-11 hijackers. https://en.wikipedia.org/wiki/Daniel_Lewin
"lewin",
// Ruth Lichterman - one of the original programmers of the ENIAC. https://en.wikipedia.org/wiki/ENIAC - https://en.wikipedia.org/wiki/Ruth_Teitelbaum
"lichterman",
// Barbara Liskov - co-developed the Liskov substitution principle. Liskov was also the winner of the Turing Prize in 2008. - https://en.wikipedia.org/wiki/Barbara_Liskov
"liskov",
// Ada Lovelace invented the first algorithm. https://en.wikipedia.org/wiki/Ada_Lovelace (thanks James Turnbull)
"lovelace",
// Auguste and Louis Lumière - the first filmmakers in history - https://en.wikipedia.org/wiki/Auguste_and_Louis_Lumi%C3%A8re
"lumiere",
// Mahavira - Ancient Indian mathematician during 9th century AD who discovered basic algebraic identities - https://en.wikipedia.org/wiki/Mah%C4%81v%C4%ABra_(mathematician)
"mahavira",
// Lynn Margulis (b. Lynn Petra Alexander) - an American evolutionary theorist and biologist, science author, educator, and popularizer, and was the primary modern proponent for the significance of symbiosis in evolution. - https://en.wikipedia.org/wiki/Lynn_Margulis
"margulis",
// Yukihiro Matsumoto - Japanese computer scientist and software programmer best known as the chief designer of the Ruby programming language. https://en.wikipedia.org/wiki/Yukihiro_Matsumoto
"matsumoto",
// James Clerk Maxwell - Scottish physicist, best known for his formulation of electromagnetic theory. https://en.wikipedia.org/wiki/James_Clerk_Maxwell
"maxwell",
// Maria Mayer - American theoretical physicist and Nobel laureate in Physics for proposing the nuclear shell model of the atomic nucleus - https://en.wikipedia.org/wiki/Maria_Mayer
"mayer",
// John McCarthy invented LISP: https://en.wikipedia.org/wiki/John_McCarthy_(computer_scientist)
"mccarthy",
// Barbara McClintock - a distinguished American cytogeneticist, 1983 Nobel Laureate in Physiology or Medicine for discovering transposons. https://en.wikipedia.org/wiki/Barbara_McClintock
"mcclintock",
// Anne Laura Dorinthea McLaren - British developmental biologist whose work helped lead to human in-vitro fertilisation. https://en.wikipedia.org/wiki/Anne_McLaren
"mclaren",
// Malcolm McLean invented the modern shipping container: https://en.wikipedia.org/wiki/Malcom_McLean
"mclean",
// Kay McNulty - one of the original programmers of the ENIAC. https://en.wikipedia.org/wiki/ENIAC - https://en.wikipedia.org/wiki/Kathleen_Antonelli
"mcnulty",
// Lise Meitner - Austrian/Swedish physicist who was involved in the discovery of nuclear fission. The element meitnerium is named after her - https://en.wikipedia.org/wiki/Lise_Meitner
"meitner",
// Gregor Johann Mendel - Czech scientist and founder of genetics. https://en.wikipedia.org/wiki/Gregor_Mendel
"mendel",
// Dmitri Mendeleev - a chemist and inventor. He formulated the Periodic Law, created a farsighted version of the periodic table of elements, and used it to correct the properties of some already discovered elements and also to predict the properties of eight elements yet to be discovered. https://en.wikipedia.org/wiki/Dmitri_Mendeleev
"mendeleev",
// Carla Meninsky, was the game designer and programmer for Atari 2600 games Dodge 'Em and Warlords. https://en.wikipedia.org/wiki/Carla_Meninsky
"meninsky",
// Ralph C. Merkle - American computer scientist, known for devising Merkle's puzzles - one of the very first schemes for public-key cryptography. Also, inventor of Merkle trees and co-inventor of the Merkle-Damgård construction for building collision-resistant cryptographic hash functions and the Merkle-Hellman knapsack cryptosystem. https://en.wikipedia.org/wiki/Ralph_Merkle
"merkle",
// Johanna Mestorf - German prehistoric archaeologist and first female museum director in Germany - https://en.wikipedia.org/wiki/Johanna_Mestorf
"mestorf",
// Maryam Mirzakhani - an Iranian mathematician and the first woman to win the Fields Medal. https://en.wikipedia.org/wiki/Maryam_Mirzakhani
"mirzakhani",
// Rita Levi-Montalcini - Won Nobel Prize in Physiology or Medicine jointly with colleague Stanley Cohen for the discovery of nerve growth factor (https://en.wikipedia.org/wiki/Rita_Levi-Montalcini)
"montalcini",
// Gordon Earle Moore - American engineer, Silicon Valley founding father, author of Moore's law. https://en.wikipedia.org/wiki/Gordon_Moore
"moore",
// Samuel Morse - contributed to the invention of a single-wire telegraph system based on European telegraphs and was a co-developer of the Morse code - https://en.wikipedia.org/wiki/Samuel_Morse
"morse",
// May-Britt Moser - Nobel prize winner neuroscientist who contributed to the discovery of grid cells in the brain. https://en.wikipedia.org/wiki/May-Britt_Moser
"moser",
// Ian Murdock - founder of the Debian project - https://en.wikipedia.org/wiki/Ian_Murdock
"murdock",
// John Napier of Merchiston - Scottish landowner known as an astronomer, mathematician and physicist. Best known for his discovery of logarithms. https://en.wikipedia.org/wiki/John_Napier
"napier",
// John Forbes Nash, Jr. - American mathematician who made fundamental contributions to game theory, differential geometry, and the study of partial differential equations. https://en.wikipedia.org/wiki/John_Forbes_Nash_Jr.
"nash",
// John von Neumann - todays computer architectures are based on the von Neumann architecture. https://en.wikipedia.org/wiki/Von_Neumann_architecture
"neumann",
// Isaac Newton invented classic mechanics and modern optics. https://en.wikipedia.org/wiki/Isaac_Newton
"newton",
// Florence Nightingale, more prominently known as a nurse, was also the first female member of the Royal Statistical Society and a pioneer in statistical graphics https://en.wikipedia.org/wiki/Florence_Nightingale#Statistics_and_sanitary_reform
"nightingale",
// Alfred Nobel - a Swedish chemist, engineer, innovator, and armaments manufacturer (inventor of dynamite) - https://en.wikipedia.org/wiki/Alfred_Nobel
"nobel",
// Emmy Noether, German mathematician. Noether's Theorem is named after her. https://en.wikipedia.org/wiki/Emmy_Noether
"noether",
// Poppy Northcutt. Poppy Northcutt was the first woman to work as part of NASA’s Mission Control. http://www.businessinsider.com/poppy-northcutt-helped-apollo-astronauts-2014-12?op=1
"northcutt",
// Robert Noyce invented silicon integrated circuits and gave Silicon Valley its name. - https://en.wikipedia.org/wiki/Robert_Noyce
"noyce",
// Panini - Ancient Indian linguist and grammarian from 4th century CE who worked on the world's first formal system - https://en.wikipedia.org/wiki/P%C4%81%E1%B9%87ini#Comparison_with_modern_formal_systems
"panini",
// Ambroise Pare invented modern surgery. https://en.wikipedia.org/wiki/Ambroise_Par%C3%A9
"pare",
// Blaise Pascal, French mathematician, physicist, and inventor - https://en.wikipedia.org/wiki/Blaise_Pascal
"pascal",
// Louis Pasteur discovered vaccination, fermentation and pasteurization. https://en.wikipedia.org/wiki/Louis_Pasteur.
"pasteur",
// Cecilia Payne-Gaposchkin was an astronomer and astrophysicist who, in 1925, proposed in her Ph.D. thesis an explanation for the composition of stars in terms of the relative abundances of hydrogen and helium. https://en.wikipedia.org/wiki/Cecilia_Payne-Gaposchkin
"payne",
// Radia Perlman is a software designer and network engineer and most famous for her invention of the spanning-tree protocol (STP). https://en.wikipedia.org/wiki/Radia_Perlman
"perlman",
// Rob Pike was a key contributor to Unix, Plan 9, the X graphic system, utf-8, and the Go programming language. https://en.wikipedia.org/wiki/Rob_Pike
"pike",
// Henri Poincaré made fundamental contributions in several fields of mathematics. https://en.wikipedia.org/wiki/Henri_Poincar%C3%A9
"poincare",
// Laura Poitras is a director and producer whose work, made possible by open source crypto tools, advances the causes of truth and freedom of information by reporting disclosures by whistleblowers such as Edward Snowden. https://en.wikipedia.org/wiki/Laura_Poitras
"poitras",
// Tat’yana Avenirovna Proskuriakova (Russian: Татья́на Авени́ровна Проскуряко́ва) (January 23 [O.S. January 10] 1909 – August 30, 1985) was a Russian-American Mayanist scholar and archaeologist who contributed significantly to the deciphering of Maya hieroglyphs, the writing system of the pre-Columbian Maya civilization of Mesoamerica. https://en.wikipedia.org/wiki/Tatiana_Proskouriakoff
"proskuriakova",
// Claudius Ptolemy - a Greco-Egyptian writer of Alexandria, known as a mathematician, astronomer, geographer, astrologer, and poet of a single epigram in the Greek Anthology - https://en.wikipedia.org/wiki/Ptolemy
"ptolemy",
// C. V. Raman - Indian physicist who won the Nobel Prize in 1930 for proposing the Raman effect. - https://en.wikipedia.org/wiki/C._V._Raman
"raman",
// Srinivasa Ramanujan - Indian mathematician and autodidact who made extraordinary contributions to mathematical analysis, number theory, infinite series, and continued fractions. - https://en.wikipedia.org/wiki/Srinivasa_Ramanujan
"ramanujan",
// Ida Rhodes - American pioneer in computer programming, designed the first computer used for Social Security. https://en.wikipedia.org/wiki/Ida_Rhodes
"rhodes",
// Sally Kristen Ride was an American physicist and astronaut. She was the first American woman in space, and the youngest American astronaut. https://en.wikipedia.org/wiki/Sally_Ride
"ride",
// Dennis Ritchie - co-creator of UNIX and the C programming language. - https://en.wikipedia.org/wiki/Dennis_Ritchie
"ritchie",
// Julia Hall Bowman Robinson - American mathematician renowned for her contributions to the fields of computability theory and computational complexity theory. https://en.wikipedia.org/wiki/Julia_Robinson
"robinson",
// Wilhelm Conrad Röntgen - German physicist who was awarded the first Nobel Prize in Physics in 1901 for the discovery of X-rays (Röntgen rays). https://en.wikipedia.org/wiki/Wilhelm_R%C3%B6ntgen
"roentgen",
// Rosalind Franklin - British biophysicist and X-ray crystallographer whose research was critical to the understanding of DNA - https://en.wikipedia.org/wiki/Rosalind_Franklin
"rosalind",
// Vera Rubin - American astronomer who pioneered work on galaxy rotation rates. https://en.wikipedia.org/wiki/Vera_Rubin
"rubin",
// Meghnad Saha - Indian astrophysicist best known for his development of the Saha equation, used to describe chemical and physical conditions in stars - https://en.wikipedia.org/wiki/Meghnad_Saha
"saha",
// Jean E. Sammet developed FORMAC, the first widely used computer language for symbolic manipulation of mathematical formulas. https://en.wikipedia.org/wiki/Jean_E._Sammet
"sammet",
// Mildred Sanderson - American mathematician best known for Sanderson's theorem concerning modular invariants. https://en.wikipedia.org/wiki/Mildred_Sanderson
"sanderson",
// Satoshi Nakamoto is the name used by the unknown person or group of people who developed bitcoin, authored the bitcoin white paper, and created and deployed bitcoin's original reference implementation. https://en.wikipedia.org/wiki/Satoshi_Nakamoto
"satoshi",
// Adi Shamir - Israeli cryptographer whose numerous inventions and contributions to cryptography include the Ferge Fiat Shamir identification scheme, the Rivest Shamir Adleman (RSA) public-key cryptosystem, the Shamir's secret sharing scheme, the breaking of the Merkle-Hellman cryptosystem, the TWINKLE and TWIRL factoring devices and the discovery of differential cryptanalysis (with Eli Biham). https://en.wikipedia.org/wiki/Adi_Shamir
"shamir",
// Claude Shannon - The father of information theory and founder of digital circuit design theory. (https://en.wikipedia.org/wiki/Claude_Shannon)
"shannon",
// Carol Shaw - Originally an Atari employee, Carol Shaw is said to be the first female video game designer. https://en.wikipedia.org/wiki/Carol_Shaw_(video_game_designer)
"shaw",
// Dame Stephanie "Steve" Shirley - Founded a software company in 1962 employing women working from home. https://en.wikipedia.org/wiki/Steve_Shirley
"shirley",
// William Shockley co-invented the transistor - https://en.wikipedia.org/wiki/William_Shockley
"shockley",
// Lina Solomonovna Stern (or Shtern; Russian: Лина Соломоновна Штерн; 26 August 1878 – 7 March 1968) was a Soviet biochemist, physiologist and humanist whose medical discoveries saved thousands of lives at the fronts of World War II. She is best known for her pioneering work on blood–brain barrier, which she described as hemato-encephalic barrier in 1921. https://en.wikipedia.org/wiki/Lina_Stern
"shtern",
// Françoise Barré-Sinoussi - French virologist and Nobel Prize Laureate in Physiology or Medicine; her work was fundamental in identifying HIV as the cause of AIDS. https://en.wikipedia.org/wiki/Fran%C3%A7oise_Barr%C3%A9-Sinoussi
"sinoussi",
// Betty Snyder - one of the original programmers of the ENIAC. https://en.wikipedia.org/wiki/ENIAC - https://en.wikipedia.org/wiki/Betty_Holberton
"snyder",
// Cynthia Solomon - Pioneer in the fields of artificial intelligence, computer science and educational computing. Known for creation of Logo, an educational programming language. https://en.wikipedia.org/wiki/Cynthia_Solomon
"solomon",
// Frances Spence - one of the original programmers of the ENIAC. https://en.wikipedia.org/wiki/ENIAC - https://en.wikipedia.org/wiki/Frances_Spence
"spence",
// Michael Stonebraker is a database research pioneer and architect of Ingres, Postgres, VoltDB and SciDB. Winner of 2014 ACM Turing Award. https://en.wikipedia.org/wiki/Michael_Stonebraker
"stonebraker",
// Ivan Edward Sutherland - American computer scientist and Internet pioneer, widely regarded as the father of computer graphics. https://en.wikipedia.org/wiki/Ivan_Sutherland
"sutherland",
// Janese Swanson (with others) developed the first of the Carmen Sandiego games. She went on to found Girl Tech. https://en.wikipedia.org/wiki/Janese_Swanson
"swanson",
// Aaron Swartz was influential in creating RSS, Markdown, Creative Commons, Reddit, and much of the internet as we know it today. He was devoted to freedom of information on the web. https://en.wikiquote.org/wiki/Aaron_Swartz
"swartz",
// Bertha Swirles was a theoretical physicist who made a number of contributions to early quantum theory. https://en.wikipedia.org/wiki/Bertha_Swirles
"swirles",
// Helen Brooke Taussig - American cardiologist and founder of the field of paediatric cardiology. https://en.wikipedia.org/wiki/Helen_B._Taussig
"taussig",
// Nikola Tesla invented the AC electric system and every gadget ever used by a James Bond villain. https://en.wikipedia.org/wiki/Nikola_Tesla
"tesla",
// Marie Tharp - American geologist and oceanic cartographer who co-created the first scientific map of the Atlantic Ocean floor. Her work led to the acceptance of the theories of plate tectonics and continental drift. https://en.wikipedia.org/wiki/Marie_Tharp
"tharp",
// Ken Thompson - co-creator of UNIX and the C programming language - https://en.wikipedia.org/wiki/Ken_Thompson
"thompson",
// Linus Torvalds invented Linux and Git. https://en.wikipedia.org/wiki/Linus_Torvalds
"torvalds",
// Youyou Tu - Chinese pharmaceutical chemist and educator known for discovering artemisinin and dihydroartemisinin, used to treat malaria, which has saved millions of lives. Joint winner of the 2015 Nobel Prize in Physiology or Medicine. https://en.wikipedia.org/wiki/Tu_Youyou
"tu",
// Alan Turing was a founding father of computer science. https://en.wikipedia.org/wiki/Alan_Turing.
"turing",
// Varahamihira - Ancient Indian mathematician who discovered trigonometric formulae during 505-587 CE - https://en.wikipedia.org/wiki/Var%C4%81hamihira#Contributions
"varahamihira",
// Dorothy Vaughan was a NASA mathematician and computer programmer on the SCOUT launch vehicle program that put America's first satellites into space - https://en.wikipedia.org/wiki/Dorothy_Vaughan
"vaughan",
// Cédric Villani - French mathematician, won Fields Medal, Fermat Prize and Poincaré Price for his work in differential geometry and statistical mechanics. https://en.wikipedia.org/wiki/C%C3%A9dric_Villani
"villani",
// Sir Mokshagundam Visvesvaraya - is a notable Indian engineer. He is a recipient of the Indian Republic's highest honour, the Bharat Ratna, in 1955. On his birthday, 15 September is celebrated as Engineer's Day in India in his memory - https://en.wikipedia.org/wiki/Visvesvaraya
"visvesvaraya",
// Christiane Nüsslein-Volhard - German biologist, won Nobel Prize in Physiology or Medicine in 1995 for research on the genetic control of embryonic development. https://en.wikipedia.org/wiki/Christiane_N%C3%BCsslein-Volhard
"volhard",
// Marlyn Wescoff - one of the original programmers of the ENIAC. https://en.wikipedia.org/wiki/ENIAC - https://en.wikipedia.org/wiki/Marlyn_Meltzer
"wescoff",
// Sylvia B. Wilbur - British computer scientist who helped develop the ARPANET, was one of the first to exchange email in the UK and a leading researcher in computer-supported collaborative work. https://en.wikipedia.org/wiki/Sylvia_Wilbur
"wilbur",
// Andrew Wiles - Notable British mathematician who proved the enigmatic Fermat's Last Theorem - https://en.wikipedia.org/wiki/Andrew_Wiles
"wiles",
// Roberta Williams, did pioneering work in graphical adventure games for personal computers, particularly the King's Quest series. https://en.wikipedia.org/wiki/Roberta_Williams
"williams",
// Malcolm John Williamson - British mathematician and cryptographer employed by the GCHQ. Developed in 1974 what is now known as Diffie-Hellman key exchange (Diffie and Hellman first published the scheme in 1976). https://en.wikipedia.org/wiki/Malcolm_J._Williamson
"williamson",
// Sophie Wilson designed the first Acorn Micro-Computer and the instruction set for ARM processors. https://en.wikipedia.org/wiki/Sophie_Wilson
"wilson",
// Jeannette Wing - co-developed the Liskov substitution principle. - https://en.wikipedia.org/wiki/Jeannette_Wing
"wing",
// Steve Wozniak invented the Apple I and Apple II. https://en.wikipedia.org/wiki/Steve_Wozniak
"wozniak",
// The Wright brothers, Orville and Wilbur - credited with inventing and building the world's first successful airplane and making the first controlled, powered and sustained heavier-than-air human flight - https://en.wikipedia.org/wiki/Wright_brothers
"wright",
// Chien-Shiung Wu - Chinese-American experimental physicist who made significant contributions to nuclear physics. https://en.wikipedia.org/wiki/Chien-Shiung_Wu
"wu",
// Rosalyn Sussman Yalow - Rosalyn Sussman Yalow was an American medical physicist, and a co-winner of the 1977 Nobel Prize in Physiology or Medicine for development of the radioimmunoassay technique. https://en.wikipedia.org/wiki/Rosalyn_Sussman_Yalow
"yalow",
// Ada Yonath - an Israeli crystallographer, the first woman from the Middle East to win a Nobel prize in the sciences. https://en.wikipedia.org/wiki/Ada_Yonath
"yonath",
// Nikolay Yegorovich Zhukovsky (Russian: Никола́й Его́рович Жуко́вский, January 17 1847 – March 17, 1921) was a Russian scientist, mathematician and engineer, and a founding father of modern aero- and hydrodynamics. Whereas contemporary scientists scoffed at the idea of human flight, Zhukovsky was the first to undertake the study of airflow. He is often called the Father of Russian Aviation. https://en.wikipedia.org/wiki/Nikolay_Yegorovich_Zhukovsky
"zhukovsky",
}
)
// GetRandomName generates a random name from the list of adjectives and surnames in this package
// formatted as "adjective_surname". For example 'focused_turing'. If retry is non-zero, a random
// digit (0-9) is appended to the name, e.g. `focused_turing3`.
func GetRandomName(retry int) string {
	for {
		name := left[rand.Intn(len(left))] + "_" + right[rand.Intn(len(right))] //nolint:gosec // G404: Use of weak random number generator (math/rand instead of crypto/rand)
		if name == "boring_wozniak" /* Steve Wozniak is not boring */ {
			continue
		}
		if retry > 0 {
			name += strconv.Itoa(rand.Intn(10)) //nolint:gosec // G404: Use of weak random number generator (math/rand instead of crypto/rand)
		}
		return name
	}
}
//go:build !windows
// Package kernel provides helper function to get, parse and compare kernel
// versions for different platforms.
package kernel
import (
"errors"
"fmt"
)
// VersionInfo holds information about the kernel, split into the numeric
// version components and the trailing free-form "flavor" suffix
// (see ParseRelease, which produces values of this type).
type VersionInfo struct {
	Kernel int    // Version of the kernel (e.g. 4.1.2-generic -> 4)
	Major  int    // Major part of the kernel version (e.g. 4.1.2-generic -> 1)
	Minor  int    // Minor part of the kernel version (e.g. 4.1.2-generic -> 2)
	Flavor string // Flavor of the kernel version (e.g. 4.1.2-generic -> generic)
}
// String formats the version as "Kernel.Major.Minor" followed by the flavor
// appended verbatim; the flavor as produced by ParseRelease already carries
// its own separator (e.g. "-generic"), so none is added here.
func (k *VersionInfo) String() string {
	return fmt.Sprintf("%d.%d.%d%s", k.Kernel, k.Major, k.Minor, k.Flavor)
}
// CompareKernelVersion compares two kernel.VersionInfo structs.
// Returns -1 if a < b, 0 if a == b, 1 if a > b
func CompareKernelVersion(a, b VersionInfo) int {
	// Compare component-by-component in order of significance; the first
	// differing component decides the result.
	pairs := [][2]int{
		{a.Kernel, b.Kernel},
		{a.Major, b.Major},
		{a.Minor, b.Minor},
	}
	for _, p := range pairs {
		switch {
		case p[0] < p[1]:
			return -1
		case p[0] > p[1]:
			return 1
		}
	}
	return 0
}
// ParseRelease parses a string and creates a VersionInfo based on it.
func ParseRelease(release string) (*VersionInfo, error) {
	var (
		kernel, major, minor int
		flavor, rest         string
	)
	// Sscanf's error is deliberately ignored: an empty flavor is legal, so
	// only the count of successfully parsed items matters — the two leading
	// version numbers are mandatory.
	n, _ := fmt.Sscanf(release, "%d.%d%s", &kernel, &major, &rest)
	if n < 2 {
		return nil, errors.New("Can't parse kernel version " + release)
	}
	// sometimes we have 3.12.25-gentoo, but sometimes we just have 3.12-1-amd64
	n, _ = fmt.Sscanf(rest, ".%d%s", &minor, &flavor)
	if n < 1 {
		// No ".minor" present; the whole remainder is the flavor.
		flavor = rest
	}
	return &VersionInfo{
		Kernel: kernel,
		Major:  major,
		Minor:  minor,
		Flavor: flavor,
	}, nil
}
//go:build linux || freebsd || openbsd
package kernel
import (
"context"
"github.com/containerd/log"
"golang.org/x/sys/unix"
)
// GetKernelVersion gets the current kernel version.
//
// It reads the running kernel's release string via uname(2) and parses it
// into a VersionInfo.
func GetKernelVersion() (*VersionInfo, error) {
	uts, err := uname()
	if err != nil {
		return nil, err
	}
	// Remove the \x00 from the release for Atoi to parse correctly
	// (ByteSliceToString truncates at the first NUL byte).
	return ParseRelease(unix.ByteSliceToString(uts.Release[:]))
}
// CheckKernelVersion checks if current kernel is newer than (or equal to)
// the given version.
func CheckKernelVersion(k, major, minor int) bool {
	v, err := GetKernelVersion()
	if err != nil {
		// If the version cannot be determined, log and optimistically
		// report the requirement as satisfied.
		log.G(context.TODO()).Warnf("error getting kernel version: %s", err)
		return true
	}
	return CompareKernelVersion(*v, VersionInfo{Kernel: k, Major: major, Minor: minor}) >= 0
}
package kernel
import "golang.org/x/sys/unix"
// uname wraps unix.Uname and returns the populated Utsname struct.
func uname() (*unix.Utsname, error) {
	var uts unix.Utsname
	if err := unix.Uname(&uts); err != nil {
		return nil, err
	}
	return &uts, nil
}
package plugins
import (
"bytes"
"context"
"encoding/json"
"io"
"net/http"
"net/url"
"time"
"github.com/containerd/log"
"github.com/docker/docker/pkg/ioutils"
"github.com/docker/docker/pkg/plugins/transport"
"github.com/docker/go-connections/sockets"
"github.com/docker/go-connections/tlsconfig"
)
const (
	// defaultTimeOut bounds plugin requests: it caps the per-retry backoff
	// (see backoff) and is the total retry budget used by abort.
	defaultTimeOut = 30 * time.Second

	// dummyHost is a hostname used for local communication.
	//
	// For local communications (npipe://, unix://), the hostname is not used,
	// but we need a valid and meaningful hostname.
	dummyHost = "plugin.moby.localhost"
)
// VersionMimetype is the Content-Type the engine sends to plugins.
// It is re-exported from the transport package for convenience.
const VersionMimetype = transport.VersionMimetype
// newTransport builds an HTTP transport for the given plugin address,
// optionally configured for TLS, with the scheme and Host header adjusted
// for local (unix/npipe) sockets.
func newTransport(addr string, tlsConfig *tlsconfig.Options) (*transport.HTTPTransport, error) {
	httpTransport := &http.Transport{}
	if tlsConfig != nil {
		tlsClientConfig, err := tlsconfig.Client(*tlsConfig)
		if err != nil {
			return nil, err
		}
		httpTransport.TLSClientConfig = tlsClientConfig
	}
	parsed, err := url.Parse(addr)
	if err != nil {
		return nil, err
	}
	// valid local socket addresses have the host empty.
	socket := parsed.Host
	if socket == "" {
		socket = parsed.Path
	}
	if err := sockets.ConfigureTransport(httpTransport, parsed.Scheme, socket); err != nil {
		return nil, err
	}
	hostName := parsed.Host
	if hostName == "" || parsed.Scheme == "unix" || parsed.Scheme == "npipe" {
		// Override host header for non-tcp connections.
		hostName = dummyHost
	}
	return transport.NewHTTPTransport(httpTransport, httpScheme(parsed), hostName), nil
}
// NewClient creates a new plugin client (http).
func NewClient(addr string, tlsConfig *tlsconfig.Options) (*Client, error) {
	tr, err := newTransport(addr, tlsConfig)
	if err != nil {
		return nil, err
	}
	// A zero timeout means requests never time out at the client level.
	return newClientWithTransport(tr, 0), nil
}
// NewClientWithTimeout creates a new plugin client (http) whose requests
// are bounded by the given timeout.
func NewClientWithTimeout(addr string, tlsConfig *tlsconfig.Options, timeout time.Duration) (*Client, error) {
	tr, err := newTransport(addr, tlsConfig)
	if err != nil {
		return nil, err
	}
	return newClientWithTransport(tr, timeout), nil
}
// newClientWithTransport creates a new plugin client with a given transport.
// The transport doubles as the request factory.
func newClientWithTransport(tr *transport.HTTPTransport, timeout time.Duration) *Client {
	httpClient := &http.Client{
		Transport: tr,
		Timeout:   timeout,
	}
	return &Client{http: httpClient, requestFactory: tr}
}
// requestFactory defines an interface that transports can implement to
// create new requests. It's used in testing.
type requestFactory interface {
	// NewRequest builds an HTTP request for the given plugin path and body.
	NewRequest(path string, data io.Reader) (*http.Request, error)
}
// Client represents a plugin client.
type Client struct {
	http           *http.Client   // http client to use
	requestFactory requestFactory // creates requests; normally the transport itself
}
// RequestOpts is the set of options that can be passed into a request
type RequestOpts struct {
	// Timeout, when non-zero, cancels an individual request after the
	// given duration (applied per attempt in callWithRetry).
	Timeout time.Duration

	// testTimeOut is used during tests to limit the max timeout in [abort]
	testTimeOut time.Duration
}
// WithRequestTimeout sets a timeout duration for plugin requests
func WithRequestTimeout(t time.Duration) func(*RequestOpts) {
	return func(opts *RequestOpts) {
		opts.Timeout = t
	}
}
// Call calls the specified method with the specified arguments for the plugin.
// It will retry for 30 seconds if a failure occurs when calling.
//
// It is shorthand for CallWithOptions with no request options.
func (c *Client) Call(serviceMethod string, args, ret interface{}) error {
	return c.CallWithOptions(serviceMethod, args, ret)
}
// CallWithOptions is just like call except it takes options
func (c *Client) CallWithOptions(serviceMethod string, args interface{}, ret interface{}, opts ...func(*RequestOpts)) error {
	// Encode args (if any) as the JSON request body.
	var payload bytes.Buffer
	if args != nil {
		if err := json.NewEncoder(&payload).Encode(args); err != nil {
			return err
		}
	}
	body, err := c.callWithRetry(serviceMethod, &payload, true, opts...)
	if err != nil {
		return err
	}
	defer body.Close()
	if ret == nil {
		return nil
	}
	if err := json.NewDecoder(body).Decode(&ret); err != nil {
		log.G(context.TODO()).Errorf("%s: error reading plugin resp: %v", serviceMethod, err)
		return err
	}
	return nil
}
// Stream calls the specified method with the specified arguments for the plugin and returns the response body
func (c *Client) Stream(serviceMethod string, args interface{}) (io.ReadCloser, error) {
	var payload bytes.Buffer
	if err := json.NewEncoder(&payload).Encode(args); err != nil {
		return nil, err
	}
	return c.callWithRetry(serviceMethod, &payload, true)
}
// SendFile calls the specified method, and passes through the IO stream
func (c *Client) SendFile(serviceMethod string, data io.Reader, ret interface{}) error {
	body, err := c.callWithRetry(serviceMethod, data, true)
	if err != nil {
		return err
	}
	defer body.Close()
	if decodeErr := json.NewDecoder(body).Decode(&ret); decodeErr != nil {
		log.G(context.TODO()).Errorf("%s: error reading plugin resp: %v", serviceMethod, decodeErr)
		return decodeErr
	}
	return nil
}
// callWithRetry performs the HTTP request for serviceMethod, retrying with
// exponential backoff (see backoff/abort) on transport-level errors when
// retry is true. Non-200 responses are never retried; they are turned into
// a *statusError, preferring the plugin's JSON {"Err": ...} message over the
// raw body. On success the caller receives the response body wrapped so that
// closing it also cancels any per-request timeout context.
//
// NOTE(review): data is an io.Reader created once by the caller; a retried
// attempt re-sends the same reader, which may already be partially consumed
// by a failed attempt — presumably acceptable for connect-phase failures,
// but worth confirming for mid-body transport errors.
func (c *Client) callWithRetry(serviceMethod string, data io.Reader, retry bool, reqOpts ...func(*RequestOpts)) (io.ReadCloser, error) {
	var retries int
	start := time.Now()

	// Fold all option functions into a single RequestOpts value.
	var opts RequestOpts
	for _, o := range reqOpts {
		o(&opts)
	}

	for {
		req, err := c.requestFactory.NewRequest(serviceMethod, data)
		if err != nil {
			return nil, err
		}

		// cancelRequest is a no-op unless a per-request timeout is set;
		// it must be called on every exit path below to release the context.
		cancelRequest := func() {}
		if opts.Timeout > 0 {
			var ctx context.Context
			ctx, cancelRequest = context.WithTimeout(req.Context(), opts.Timeout)
			req = req.WithContext(ctx)
		}

		resp, err := c.http.Do(req)
		if err != nil {
			cancelRequest()
			if !retry {
				return nil, err
			}

			timeOff := backoff(retries)
			if abort(start, timeOff, opts.testTimeOut) {
				// Total retry budget exhausted; surface the last error.
				return nil, err
			}
			retries++
			log.G(context.TODO()).Warnf("Unable to connect to plugin: %s%s: %v, retrying in %v", req.URL.Host, req.URL.Path, err, timeOff)
			time.Sleep(timeOff)
			continue
		}

		if resp.StatusCode != http.StatusOK {
			b, err := io.ReadAll(resp.Body)
			resp.Body.Close()
			cancelRequest()
			if err != nil {
				return nil, &statusError{resp.StatusCode, serviceMethod, err.Error()}
			}

			// Plugins' Response(s) should have an Err field indicating what went
			// wrong. Try to unmarshal into ResponseErr. Otherwise fallback to just
			// return the string(body)
			type responseErr struct {
				Err string
			}
			remoteErr := responseErr{}
			if err := json.Unmarshal(b, &remoteErr); err == nil {
				if remoteErr.Err != "" {
					return nil, &statusError{resp.StatusCode, serviceMethod, remoteErr.Err}
				}
			}
			// old way...
			return nil, &statusError{resp.StatusCode, serviceMethod, string(b)}
		}

		// Success: hand the body to the caller, deferring context cleanup
		// until the caller closes it.
		return ioutils.NewReadCloserWrapper(resp.Body, func() error {
			err := resp.Body.Close()
			cancelRequest()
			return err
		}), nil
	}
}
// backoff returns the delay to wait before the next retry attempt. The
// delay doubles with every completed retry (1s, 2s, 4s, ...) and is capped
// at defaultTimeOut.
func backoff(retries int) time.Duration {
	d := 1 * time.Second
	for i := 0; i < retries; i++ {
		if d >= defaultTimeOut {
			break
		}
		d *= 2
	}
	if d > defaultTimeOut {
		d = defaultTimeOut
	}
	return d
}
// testNonExistingPlugin is a special plugin-name, which overrides defaultTimeOut in tests.
// Lookups for this name use a short (2s) deadline in loadWithRetry instead of
// waiting for the full defaultTimeOut.
const testNonExistingPlugin = "this-plugin-does-not-exist"
// abort reports whether the retry loop should give up: it returns true once
// the time already elapsed since start, plus the upcoming backoff delay,
// reaches the deadline (defaultTimeOut unless overrideTimeout > 0).
func abort(start time.Time, timeOff time.Duration, overrideTimeout time.Duration) bool {
	limit := defaultTimeOut
	if overrideTimeout > 0 {
		limit = overrideTimeout
	}
	return time.Since(start)+timeOff >= limit
}
func httpScheme(u *url.URL) string {
scheme := u.Scheme
if scheme != "https" {
scheme = "http"
}
return scheme
}
package plugins
import (
"encoding/json"
"io/fs"
"net/url"
"os"
"path/filepath"
"strings"
"sync"
"github.com/containerd/log"
"github.com/moby/sys/userns"
"github.com/pkg/errors"
)
// ErrNotFound plugin not found; returned when a plugin cannot be located in
// any of the socket or spec discovery paths.
var ErrNotFound = errors.New("plugin not found")

// defaultSocketsPath is the default directory scanned for plugin UNIX sockets.
const defaultSocketsPath = "/run/docker/plugins"

// LocalRegistry defines a registry that is local (using unix socket).
type LocalRegistry struct {
	socketsPath string   // directory scanned for <name>.sock plugin sockets
	specsPaths  []string // directories scanned for <name>.spec / <name>.json files
}
// NewLocalRegistry returns a LocalRegistry that discovers plugins in the
// default socket directory and the platform-specific spec directories.
func NewLocalRegistry() LocalRegistry {
	return LocalRegistry{
		socketsPath: defaultSocketsPath,
		specsPaths:  specsPaths(),
	}
}
// Scan scans all the plugin paths and returns all the names it found
func (l *LocalRegistry) Scan() ([]string, error) {
	var names []string

	// First, look for UNIX sockets under the sockets path. A missing
	// directory is not an error; it simply yields no plugins.
	dirEntries, err := os.ReadDir(l.socketsPath)
	if err != nil && !os.IsNotExist(err) {
		return nil, errors.Wrap(err, "error reading dir entries")
	}

	for _, entry := range dirEntries {
		if entry.IsDir() {
			// Nested layout: <socketsPath>/<name>/<name>.sock. Replace the
			// directory entry with the socket's FileInfo so the mode check
			// below applies to the socket itself.
			fi, err := os.Stat(filepath.Join(l.socketsPath, entry.Name(), entry.Name()+".sock"))
			if err != nil {
				continue
			}

			entry = fs.FileInfoToDirEntry(fi)
		}

		if entry.Type()&os.ModeSocket != 0 {
			names = append(names, strings.TrimSuffix(filepath.Base(entry.Name()), filepath.Ext(entry.Name())))
		}
	}

	// Then, look for .spec/.json spec files in every specs path.
	for _, p := range l.specsPaths {
		dirEntries, err = os.ReadDir(p)
		if err != nil {
			if os.IsNotExist(err) {
				continue
			}
			if os.IsPermission(err) && userns.RunningInUserNS() {
				// Spec dirs may be unreadable inside a user namespace;
				// treat that as "no plugins here" rather than failing.
				log.L.Debug(err.Error())
				continue
			}
			return nil, errors.Wrap(err, "error reading dir entries")
		}

		for _, entry := range dirEntries {
			if entry.IsDir() {
				// Nested layout: <specsPath>/<name>/<name>.<ext>.
				infos, err := os.ReadDir(filepath.Join(p, entry.Name()))
				if err != nil {
					continue
				}

				for _, info := range infos {
					if strings.TrimSuffix(info.Name(), filepath.Ext(info.Name())) == entry.Name() {
						entry = info
						break
					}
				}
			}

			switch ext := filepath.Ext(entry.Name()); ext {
			case ".spec", ".json":
				plugin := strings.TrimSuffix(entry.Name(), ext)
				names = append(names, plugin)
			default:
			}
		}
	}
	return names, nil
}
// Plugin returns the plugin registered with the given name (or returns an error).
//
// UNIX sockets take precedence over spec files; candidate locations are
// generated by pluginPaths and probed in order.
func (l *LocalRegistry) Plugin(name string) (*Plugin, error) {
	socketPaths := pluginPaths(l.socketsPath, name, ".sock")
	for _, p := range socketPaths {
		if fi, err := os.Stat(p); err == nil && fi.Mode()&os.ModeSocket != 0 {
			return NewLocalPlugin(name, "unix://"+p), nil
		}
	}

	var txtSpecPaths []string
	for _, p := range l.specsPaths {
		txtSpecPaths = append(txtSpecPaths, pluginPaths(p, name, ".spec")...)
		txtSpecPaths = append(txtSpecPaths, pluginPaths(p, name, ".json")...)
	}

	for _, p := range txtSpecPaths {
		if _, err := os.Stat(p); err == nil {
			if strings.HasSuffix(p, ".json") {
				return readPluginJSONInfo(name, p)
			}
			return readPluginInfo(name, p)
		}
	}
	return nil, errors.Wrapf(ErrNotFound, "could not find plugin %s in v1 plugin registry", name)
}
// SpecsPaths returns paths in which to look for plugins, in order of priority.
//
// On Windows:
//
// - "%programdata%\docker\plugins"
//
// On Unix in non-rootless mode:
//
// - "/etc/docker/plugins"
// - "/usr/lib/docker/plugins"
//
// On Unix in rootless-mode:
//
// - "$XDG_CONFIG_HOME/docker/plugins" (or "/etc/docker/plugins" if $XDG_CONFIG_HOME is not set)
// - "$HOME/.local/lib/docker/plugins" (or "/usr/lib/docker/plugins" if $HOME is not set)
func SpecsPaths() []string {
	return specsPaths()
}
// readPluginInfo loads a plugin from a plain-text ".spec" file, which holds a
// single URL (e.g. "unix:///run/foo.sock" or "tcp://host:port") identifying
// the plugin's endpoint. The URL must carry an explicit scheme.
func readPluginInfo(name, path string) (*Plugin, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	addr := strings.TrimSpace(string(raw))

	parsed, err := url.Parse(addr)
	if err != nil {
		return nil, err
	}
	if parsed.Scheme == "" {
		return nil, errors.New("Unknown protocol")
	}

	return NewLocalPlugin(name, addr), nil
}
// readPluginJSONInfo loads a plugin from a ".json" spec file containing a
// serialized Plugin (address and optional TLS configuration). The plugin's
// name is taken from the argument, not the file.
func readPluginJSONInfo(name, path string) (*Plugin, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var p Plugin
	if err := json.NewDecoder(f).Decode(&p); err != nil {
		return nil, err
	}
	p.name = name
	if p.TLSConfig != nil && p.TLSConfig.CAFile == "" {
		// TLS requested but no CA to verify against: skip verification.
		p.TLSConfig.InsecureSkipVerify = true
	}
	// Initialize the activation condition variable, which NewLocalPlugin
	// would otherwise set up.
	p.activateWait = sync.NewCond(&sync.Mutex{})

	return &p, nil
}
func pluginPaths(base, name, ext string) []string {
return []string{
filepath.Join(base, name+ext),
filepath.Join(base, name, name+ext),
}
}
//go:build !windows
package plugins
import (
"path/filepath"
"github.com/docker/docker/pkg/homedir"
"github.com/docker/docker/pkg/rootless"
)
// rootlessConfigPluginsPath returns the configuration directory to scan for
// plugin spec files when running in rootless mode. It prefers
// "$XDG_CONFIG_HOME/docker/plugins" and falls back to the system-wide
// "/etc/docker/plugins" when the user config home cannot be resolved.
//
// Bug fix: the condition was inverted (err != nil), which returned the
// XDG-based path built from an unset configHome on failure, and the
// system-wide fallback on success. rootlessLibPluginsPath shows the intended
// err == nil pattern.
func rootlessConfigPluginsPath() string {
	if configHome, err := homedir.GetConfigHome(); err == nil {
		return filepath.Join(configHome, "docker/plugins")
	}
	return "/etc/docker/plugins"
}
// rootlessLibPluginsPath returns the library directory to scan for plugin
// spec files when running in rootless mode. It prefers the user's lib home
// ("$HOME/.local/lib/docker/plugins") and falls back to the system-wide
// "/usr/lib/docker/plugins" when the lib home cannot be resolved.
func rootlessLibPluginsPath() string {
	if libHome, err := homedir.GetLibHome(); err == nil {
		return filepath.Join(libHome, "docker/plugins")
	}
	return "/usr/lib/docker/plugins"
}
// specsPaths is the non-Windows implementation of [SpecsPaths].
// Rootless daemons look in per-user directories first; otherwise the
// traditional system-wide locations are used.
func specsPaths() []string {
	if rootless.RunningWithRootlessKit() {
		return []string{rootlessConfigPluginsPath(), rootlessLibPluginsPath()}
	}
	return []string{"/etc/docker/plugins", "/usr/lib/docker/plugins"}
}
package plugins
import (
"fmt"
"net/http"
)
type statusError struct {
status int
method string
err string
}
// Error returns a formatted string for this error type
func (e *statusError) Error() string {
return fmt.Sprintf("%s: %v", e.method, e.err)
}
// IsNotFound indicates if the passed in error is from an http.StatusNotFound from the plugin
func IsNotFound(err error) bool {
return isStatusError(err, http.StatusNotFound)
}
func isStatusError(err error, status int) bool {
if err == nil {
return false
}
e, ok := err.(*statusError)
if !ok {
return false
}
return e.status == status
}
// Package plugins provides structures and helper functions to manage Docker
// plugins.
//
// Docker discovers plugins by looking for them in the plugin directory whenever
// a user or container tries to use one by name. UNIX domain socket files must
// be located under /run/docker/plugins, whereas spec files can be located
// either under /etc/docker/plugins or /usr/lib/docker/plugins. This is handled
// by the Registry interface, which lets you list all plugins or get a plugin by
// its name if it exists.
//
// The plugins need to implement an HTTP server and bind this to the UNIX socket
// or the address specified in the spec files.
// A handshake is sent at /Plugin.Activate, and plugins are expected to return
// a Manifest with a list of Docker subsystems which this plugin implements.
//
// In order to use a plugin, you can use `Get` with the name of the
// plugin and the subsystem it implements.
//
// plugin, err := plugins.Get("example", "VolumeDriver")
// if err != nil {
// return fmt.Errorf("Error looking up volume plugin example: %v", err)
// }
package plugins
import (
"context"
"errors"
"fmt"
"sync"
"time"
"github.com/containerd/log"
"github.com/docker/go-connections/tlsconfig"
)
// ProtocolSchemeHTTPV1 is the name of the protocol used for interacting with plugins using this package.
const ProtocolSchemeHTTPV1 = "moby.plugins.http/v1"

// ErrNotImplements is returned if the plugin does not implement the requested driver.
var ErrNotImplements = errors.New("Plugin does not implement the requested driver")

// plugins is a mutex-guarded cache of loaded plugins, keyed by name.
type plugins struct {
	sync.Mutex
	plugins map[string]*Plugin
}

// extpointHandlers maps an extension-point (subsystem) name to the callbacks
// registered for it via Handle.
type extpointHandlers struct {
	sync.RWMutex
	extpointHandlers map[string][]func(string, *Client)
}

var (
	// storage caches plugins so each one is located and activated only once.
	storage = plugins{plugins: make(map[string]*Plugin)}
	// handlers holds the registered extension-point callbacks.
	handlers = extpointHandlers{extpointHandlers: make(map[string][]func(string, *Client))}
)
// Manifest lists what a plugin implements.
type Manifest struct {
	// List of subsystem the plugin implements.
	Implements []string
}

// Plugin is the definition of a docker plugin.
type Plugin struct {
	// Name of the plugin
	name string
	// Address of the plugin
	Addr string
	// TLS configuration of the plugin
	TLSConfig *tlsconfig.Options
	// Client attached to the plugin
	client *Client
	// Manifest of the plugin (see above); nil until activation succeeds.
	Manifest *Manifest `json:"-"`
	// wait for activation to finish; its lock also guards activateErr,
	// Manifest, and handlersRun.
	activateWait *sync.Cond
	// error produced by activation
	activateErr error
	// keeps track of callback handlers run against this plugin
	handlersRun bool
}
// Name returns the name of the plugin.
func (p *Plugin) Name() string {
	return p.name
}

// Client returns a ready-to-use plugin client that can be used to communicate with the plugin.
func (p *Plugin) Client() *Client {
	return p.client
}

// Protocol returns the protocol name/version used for plugins in this package.
func (p *Plugin) Protocol() string {
	return ProtocolSchemeHTTPV1
}

// IsV1 returns true for V1 plugins and false otherwise.
// Every plugin handled by this package is a V1 plugin.
func (p *Plugin) IsV1() bool {
	return true
}

// ScopedPath returns the path scoped to the plugin's rootfs.
// For v1 plugins, this always returns the path unchanged as v1 plugins run directly on the host.
func (p *Plugin) ScopedPath(s string) string {
	return s
}
// NewLocalPlugin creates a new local plugin with the given name and address.
// The plugin is not activated; callers use activate/waitActive for that.
func NewLocalPlugin(name, addr string) *Plugin {
	return &Plugin{
		name: name,
		Addr: addr,
		// TODO: change to nil
		TLSConfig:    &tlsconfig.Options{InsecureSkipVerify: true},
		activateWait: sync.NewCond(&sync.Mutex{}),
	}
}
// activate performs the plugin handshake (at most once) and runs any
// registered handlers, returning the cached activation result on subsequent
// calls. Goroutines blocked in waitActive are woken via Broadcast.
func (p *Plugin) activate() error {
	p.activateWait.L.Lock()
	if p.activated() {
		// Already activated (possibly by a concurrent caller): still run
		// handlers, since Handle may have registered new ones since.
		p.runHandlers()
		p.activateWait.L.Unlock()
		return p.activateErr
	}

	p.activateErr = p.activateWithLock()

	p.runHandlers()
	p.activateWait.L.Unlock()
	p.activateWait.Broadcast()
	return p.activateErr
}
// runHandlers runs the registered handlers for the implemented plugin types
// This should only be run after activation, and while the activation lock is held.
func (p *Plugin) runHandlers() {
	if !p.activated() {
		return
	}

	handlers.RLock()
	if !p.handlersRun {
		for _, iface := range p.Manifest.Implements {
			hdlrs, handled := handlers.extpointHandlers[iface]
			if !handled {
				continue
			}
			for _, handler := range hdlrs {
				handler(p.name, p.client)
			}
		}
		// Mark callbacks as delivered; Handle resets this flag when a new
		// handler is registered for a subsystem this plugin implements.
		p.handlersRun = true
	}
	handlers.RUnlock()
}
// activated returns if the plugin has already been activated.
// This should only be called with the activation lock held
func (p *Plugin) activated() bool {
	// A non-nil Manifest is only ever set by a successful handshake.
	return p.Manifest != nil
}
// activateWithLock performs the actual handshake: it builds a client for the
// plugin's address and calls Plugin.Activate, storing the returned Manifest.
// Callers must hold the activation lock.
func (p *Plugin) activateWithLock() error {
	c, err := NewClient(p.Addr, p.TLSConfig)
	if err != nil {
		return err
	}
	p.client = c

	m := new(Manifest)
	if err = p.client.Call("Plugin.Activate", nil, m); err != nil {
		return err
	}

	p.Manifest = m
	return nil
}
// waitActive blocks until the plugin's activation attempt has finished
// (successfully or not) and returns the activation error, if any.
func (p *Plugin) waitActive() error {
	p.activateWait.L.Lock()
	for !p.activated() && p.activateErr == nil {
		p.activateWait.Wait()
	}
	p.activateWait.L.Unlock()
	return p.activateErr
}
// implements reports whether the plugin's manifest lists the given subsystem
// (e.g. "VolumeDriver"). It returns false for a plugin that has not been
// activated yet (nil Manifest).
func (p *Plugin) implements(kind string) bool {
	if p.Manifest == nil {
		return false
	}
	for i := range p.Manifest.Implements {
		if p.Manifest.Implements[i] == kind {
			return true
		}
	}
	return false
}
// loadWithRetry looks up name in the local registry and activates it, caching
// the result in storage. When retry is true, failed lookups are retried with
// the same backoff/abort policy used for plugin calls.
func loadWithRetry(name string, retry bool) (*Plugin, error) {
	registry := NewLocalRegistry()
	start := time.Now()

	var testTimeOut time.Duration
	if name == testNonExistingPlugin {
		// override the timeout in tests
		testTimeOut = 2 * time.Second
	}

	var retries int
	for {
		plugin, err := registry.Plugin(name)
		if err != nil {
			if !retry {
				return nil, err
			}

			timeOff := backoff(retries)
			if abort(start, timeOff, testTimeOut) {
				return nil, err
			}
			retries++
			log.G(context.TODO()).Warnf("Unable to locate plugin: %s, retrying in %v", name, timeOff)
			time.Sleep(timeOff)
			continue
		}

		// A concurrent caller may have stored the plugin while we scanned;
		// prefer the cached instance so activation happens only once.
		storage.Lock()
		if pl, exists := storage.plugins[name]; exists {
			storage.Unlock()
			return pl, pl.activate()
		}
		storage.plugins[name] = plugin
		storage.Unlock()

		err = plugin.activate()

		if err != nil {
			// Activation failed: drop the cached entry so a later call can
			// retry from scratch.
			storage.Lock()
			delete(storage.plugins, name)
			storage.Unlock()
		}

		return plugin, err
	}
}
// get returns the cached plugin for name, activating it if needed, or falls
// back to a retrying registry lookup on a cache miss.
func get(name string) (*Plugin, error) {
	storage.Lock()
	pl, ok := storage.plugins[name]
	storage.Unlock()
	if ok {
		return pl, pl.activate()
	}
	return loadWithRetry(name, true)
}
// Get returns the plugin given the specified name and requested implementation.
func Get(name, imp string) (*Plugin, error) {
	if name == "" {
		return nil, errors.New("Unable to find plugin without name")
	}
	pl, err := get(name)
	if err != nil {
		return nil, err
	}
	if err := pl.waitActive(); err == nil && pl.implements(imp) {
		log.G(context.TODO()).Debugf("%s implements: %s", name, imp)
		return pl, nil
	}
	// NOTE: an activation failure is also reported as ErrNotImplements here,
	// since a plugin that never activated cannot advertise any subsystem.
	return nil, fmt.Errorf("%w: plugin=%q, requested implementation=%q", ErrNotImplements, name, imp)
}
// Handle adds the specified function to the extpointHandlers.
// It also flags already-activated plugins that implement iface so the new
// handler is run the next time their handlers are processed (see runHandlers).
func Handle(iface string, fn func(string, *Client)) {
	handlers.Lock()
	hdlrs, ok := handlers.extpointHandlers[iface]
	if !ok {
		hdlrs = []func(string, *Client){}
	}

	hdlrs = append(hdlrs, fn)
	handlers.extpointHandlers[iface] = hdlrs

	storage.Lock()
	for _, p := range storage.plugins {
		p.activateWait.L.Lock()
		if p.activated() && p.implements(iface) {
			// Force runHandlers to deliver the newly added handler.
			p.handlersRun = false
		}
		p.activateWait.L.Unlock()
	}
	storage.Unlock()

	handlers.Unlock()
}
// GetAll returns all the plugins for the specified implementation
func (l *LocalRegistry) GetAll(imp string) ([]*Plugin, error) {
	pluginNames, err := l.Scan()
	if err != nil {
		return nil, err
	}

	type plLoad struct {
		pl  *Plugin
		err error
	}

	// Load uncached plugins concurrently; the channel is sized so that no
	// send can block.
	chPl := make(chan *plLoad, len(pluginNames))
	var wg sync.WaitGroup
	for _, name := range pluginNames {
		storage.Lock()
		pl, ok := storage.plugins[name]
		storage.Unlock()
		if ok {
			chPl <- &plLoad{pl, nil}
			continue
		}

		wg.Add(1)
		go func(name string) {
			defer wg.Done()
			// Retries are disabled here; a plugin that cannot be loaded is
			// logged and skipped below.
			plg, err := loadWithRetry(name, false)
			chPl <- &plLoad{plg, err}
		}(name)
	}

	wg.Wait()
	close(chPl)

	var out []*Plugin
	for pl := range chPl {
		if pl.err != nil {
			// One broken plugin should not hide the others.
			log.G(context.TODO()).Error(pl.err)
			continue
		}
		if err := pl.pl.waitActive(); err == nil && pl.pl.implements(imp) {
			out = append(out, pl.pl)
		}
	}
	return out, nil
}
package transport
import (
"io"
"net/http"
"strings"
)
// HTTPTransport holds an [http.RoundTripper]
// and information about the scheme and address the transport
// sends request to.
type HTTPTransport struct {
http.RoundTripper
scheme string
addr string
}
// NewHTTPTransport creates a new HTTPTransport.
func NewHTTPTransport(r http.RoundTripper, scheme, addr string) *HTTPTransport {
return &HTTPTransport{
RoundTripper: r,
scheme: scheme,
addr: addr,
}
}
// NewRequest creates a new http.Request and sets the URL
// scheme and address with the transport's fields.
// All plugin calls are POSTs against a path rooted at "/".
func (t HTTPTransport) NewRequest(path string, data io.Reader) (*http.Request, error) {
	if !strings.HasPrefix(path, "/") {
		path = "/" + path
	}

	req, err := http.NewRequest(http.MethodPost, path, data)
	if err != nil {
		return nil, err
	}

	// Route the request at the plugin endpoint configured on the transport.
	req.URL.Scheme = t.scheme
	req.URL.Host = t.addr
	req.Header.Add("Accept", VersionMimetype)
	return req, nil
}
// Package pools provides a collection of pools which provide various
// data types with buffers. These can be used to lower the number of
// memory allocations and reuse buffers.
//
// New pools should be added to this package to allow them to be
// shared across packages.
//
// Utility functions which operate on pools should be added to this
// package to allow them to be reused.
package pools
import (
"bufio"
"io"
"sync"
"github.com/docker/docker/pkg/ioutils"
)
// buffer32K is the size, in bytes, of the pooled buffers below.
const buffer32K = 32 * 1024

var (
	// BufioReader32KPool is a pool which returns bufio.Reader with a 32K buffer.
	BufioReader32KPool = newBufioReaderPoolWithSize(buffer32K)
	// BufioWriter32KPool is a pool which returns bufio.Writer with a 32K buffer.
	BufioWriter32KPool = newBufioWriterPoolWithSize(buffer32K)
	// buffer32KPool provides raw 32K scratch buffers for Copy.
	buffer32KPool = newBufferPoolWithSize(buffer32K)
)
// BufioReaderPool is a bufio reader that uses sync.Pool.
type BufioReaderPool struct {
pool sync.Pool
}
// newBufioReaderPoolWithSize is unexported because new pools should be
// added here to be shared where required.
func newBufioReaderPoolWithSize(size int) *BufioReaderPool {
return &BufioReaderPool{
pool: sync.Pool{
New: func() interface{} { return bufio.NewReaderSize(nil, size) },
},
}
}
// Get returns a bufio.Reader which reads from r. The buffer size is that of the pool.
func (bufPool *BufioReaderPool) Get(r io.Reader) *bufio.Reader {
buf := bufPool.pool.Get().(*bufio.Reader)
buf.Reset(r)
return buf
}
// Put puts the bufio.Reader back into the pool.
func (bufPool *BufioReaderPool) Put(b *bufio.Reader) {
b.Reset(nil)
bufPool.pool.Put(b)
}
// bufferPool hands out fixed-size []byte scratch buffers via sync.Pool.
// Pointers to slices (rather than raw slices) are pooled so that Put does not
// allocate when boxing the value into the pool's interface{}.
type bufferPool struct {
	pool sync.Pool
}

// newBufferPoolWithSize builds a pool whose buffers all have the given size.
func newBufferPoolWithSize(size int) *bufferPool {
	bp := &bufferPool{}
	bp.pool.New = func() interface{} {
		buf := make([]byte, size)
		return &buf
	}
	return bp
}

// Get fetches a buffer from the pool, allocating a fresh one if none is free.
func (bp *bufferPool) Get() *[]byte {
	return bp.pool.Get().(*[]byte)
}

// Put returns a buffer to the pool for reuse.
func (bp *bufferPool) Put(b *[]byte) {
	bp.pool.Put(b)
}
// Copy is a convenience wrapper which uses a buffer to avoid allocation in io.Copy.
func Copy(dst io.Writer, src io.Reader) (written int64, _ error) {
	buf := buffer32KPool.Get()
	defer buffer32KPool.Put(buf)
	return io.CopyBuffer(dst, src, *buf)
}
// NewReadCloserWrapper returns a wrapper which puts the bufio.Reader back
// into the pool and closes the reader if it's an io.ReadCloser.
func (bufPool *BufioReaderPool) NewReadCloserWrapper(buf *bufio.Reader, r io.Reader) io.ReadCloser {
	return ioutils.NewReadCloserWrapper(r, func() error {
		if readCloser, ok := r.(io.ReadCloser); ok {
			// Best effort: the close error is intentionally discarded so the
			// buffer is always returned to the pool.
			_ = readCloser.Close()
		}
		bufPool.Put(buf)
		return nil
	})
}
// BufioWriterPool is a bufio writer that uses sync.Pool.
type BufioWriterPool struct {
pool sync.Pool
}
// newBufioWriterPoolWithSize is unexported because new pools should be
// added here to be shared where required.
func newBufioWriterPoolWithSize(size int) *BufioWriterPool {
return &BufioWriterPool{
pool: sync.Pool{
New: func() interface{} { return bufio.NewWriterSize(nil, size) },
},
}
}
// Get returns a bufio.Writer which writes to w. The buffer size is that of the pool.
func (bufPool *BufioWriterPool) Get(w io.Writer) *bufio.Writer {
buf := bufPool.pool.Get().(*bufio.Writer)
buf.Reset(w)
return buf
}
// Put puts the bufio.Writer back into the pool.
func (bufPool *BufioWriterPool) Put(b *bufio.Writer) {
b.Reset(nil)
bufPool.pool.Put(b)
}
// NewWriteCloserWrapper returns a wrapper which puts the bufio.Writer back
// into the pool and closes the writer if it's an io.WriteCloser.
func (bufPool *BufioWriterPool) NewWriteCloserWrapper(buf *bufio.Writer, w io.Writer) io.WriteCloser {
	return ioutils.NewWriteCloserWrapper(w, func() error {
		// Flush buffered bytes before the destination is closed; errors are
		// intentionally discarded so the buffer is always pooled again.
		_ = buf.Flush()
		if writeCloser, ok := w.(io.WriteCloser); ok {
			_ = writeCloser.Close()
		}
		bufPool.Put(buf)
		return nil
	})
}
package progress
import (
"fmt"
)
// Progress represents the progress of a transfer.
type Progress struct {
	ID string

	// Progress contains a Message or...
	Message string

	// ...progress of an action
	Action  string
	Current int64
	Total   int64

	// If true, don't show xB/yB
	HideCounts bool
	// If not empty, use units instead of bytes for counts
	Units string

	// Aux contains extra information not presented to the user, such as
	// digests for push signing.
	Aux interface{}

	// LastUpdate marks the final update for an action (used by writers to
	// emit a trailing newline).
	LastUpdate bool
}
// Output is an interface for writing progress information. It's
// like a writer for progress, but we don't call it Writer because
// that would be confusing next to ProgressReader (also, because it
// doesn't implement the io.Writer interface).
type Output interface {
	WriteProgress(Progress) error
}

// chanOutput adapts a send-only Progress channel to the Output interface.
type chanOutput chan<- Progress

// WriteProgress sends p on the underlying channel. A send on a closed
// channel is swallowed by the deferred recover below.
func (out chanOutput) WriteProgress(p Progress) error {
	// FIXME: workaround for panic in #37735
	defer func() {
		recover()
	}()
	out <- p
	return nil
}

// ChanOutput returns an Output that writes progress updates to the
// supplied channel.
func ChanOutput(progressChan chan<- Progress) Output {
	return chanOutput(progressChan)
}
// discardOutput is an Output that drops every update.
type discardOutput struct{}

func (discardOutput) WriteProgress(Progress) error {
	return nil
}

// DiscardOutput returns an Output that discards progress
func DiscardOutput() Output {
	return discardOutput{}
}
// Update is a convenience function to write a progress update to the channel.
func Update(out Output, id, action string) {
	out.WriteProgress(Progress{ID: id, Action: action})
}

// Updatef is a convenience function to write a printf-formatted progress update
// to the channel.
func Updatef(out Output, id, format string, a ...interface{}) {
	Update(out, id, fmt.Sprintf(format, a...))
}

// Message is a convenience function to write a progress message to the channel.
func Message(out Output, id, message string) {
	out.WriteProgress(Progress{ID: id, Message: message})
}

// Messagef is a convenience function to write a printf-formatted progress
// message to the channel.
func Messagef(out Output, id, format string, a ...interface{}) {
	Message(out, id, fmt.Sprintf(format, a...))
}

// Aux sends auxiliary information over a progress interface, which will not be
// formatted for the UI. This is used for things such as push signing.
func Aux(out Output, a interface{}) {
	out.WriteProgress(Progress{Aux: a})
}
package progress
import (
"io"
"time"
"golang.org/x/time/rate"
)
// Reader is a Reader with progress bar.
type Reader struct {
	in          io.ReadCloser // Stream to read from
	out         Output        // Where to send progress bar to
	size        int64         // total expected bytes (0 or negative if unknown)
	current     int64         // bytes read so far
	lastUpdate  int64         // value of current at the last emitted update
	id          string        // progress entry ID
	action      string        // action label shown with the progress bar
	rateLimiter *rate.Limiter // throttles progress updates
}
// NewProgressReader creates a new ProgressReader.
// Progress updates are rate-limited to at most one per 100ms.
func NewProgressReader(in io.ReadCloser, out Output, size int64, id, action string) *Reader {
	return &Reader{
		in:          in,
		out:         out,
		size:        size,
		id:          id,
		action:      action,
		rateLimiter: rate.NewLimiter(rate.Every(100*time.Millisecond), 1),
	}
}
// Read reads from the underlying stream, accounting the bytes read and
// emitting a progress update roughly every 512kB (or every 1% of the total,
// whichever is smaller), and always on error/EOF.
func (p *Reader) Read(buf []byte) (int, error) {
	read, err := p.in.Read(buf)
	p.current += int64(read)
	updateEvery := int64(1024 * 512) // 512kB
	if p.size > 0 {
		// Update progress for every 1% read if 1% < 512kB
		if increment := int64(0.01 * float64(p.size)); increment < updateEvery {
			updateEvery = increment
		}
	}
	if p.current-p.lastUpdate > updateEvery || err != nil {
		p.updateProgress(err != nil && read == 0)
		p.lastUpdate = p.current
	}

	return read, err
}
// Close closes the progress reader and its underlying reader.
func (p *Reader) Close() error {
	if p.current < p.size {
		// print a full progress bar when closing prematurely
		p.current = p.size
		p.updateProgress(false)
	}
	return p.in.Close()
}
// updateProgress emits a progress update, subject to the rate limiter.
// Final updates (last, or reaching size) bypass the limiter so the terminal
// state is never dropped.
func (p *Reader) updateProgress(last bool) {
	if last || p.current == p.size || p.rateLimiter.Allow() {
		p.out.WriteProgress(Progress{ID: p.id, Action: p.action, Current: p.current, Total: p.size, LastUpdate: last})
	}
}
package rootless
import "os"
// RunningWithRootlessKit returns true if running under RootlessKit namespaces.
// Detection is based on the ROOTLESSKIT_STATE_DIR environment variable, which
// must be present and non-empty.
func RunningWithRootlessKit() bool {
	stateDir := os.Getenv("ROOTLESSKIT_STATE_DIR")
	return stateDir != ""
}
// Package streamformatter provides helper functions to format a stream.
package streamformatter
import (
"encoding/json"
"fmt"
"io"
"sync"
"github.com/docker/docker/pkg/jsonmessage"
"github.com/docker/docker/pkg/progress"
)
// streamNewline terminates every JSON message written to the stream.
const streamNewline = "\r\n"

// jsonProgressFormatter renders progress as JSONMessage-encoded lines.
type jsonProgressFormatter struct{}

// appendNewline terminates a formatted message with the stream separator.
func appendNewline(source []byte) []byte {
	return append(source, streamNewline...)
}
// FormatStatus formats the specified objects according to the specified format (and id).
// The result is a newline-terminated JSONMessage; marshaling failures are
// reported via FormatError so callers always receive valid output.
func FormatStatus(id, format string, a ...interface{}) []byte {
	str := fmt.Sprintf(format, a...)
	b, err := json.Marshal(&jsonmessage.JSONMessage{ID: id, Status: str})
	if err != nil {
		return FormatError(err)
	}
	return appendNewline(b)
}
// FormatError formats the error as a JSON object
// (a newline-terminated JSONMessage with its Error field set). If even that
// fails to marshal, a static fallback error line is returned.
func FormatError(err error) []byte {
	jsonError, ok := err.(*jsonmessage.JSONError)
	if !ok {
		jsonError = &jsonmessage.JSONError{Message: err.Error()}
	}
	if b, err := json.Marshal(&jsonmessage.JSONMessage{Error: jsonError, ErrorMessage: err.Error()}); err == nil {
		return appendNewline(b)
	}
	return []byte(`{"error":"format error"}` + streamNewline)
}
// formatStatus formats a status message as a JSONMessage line.
func (sf *jsonProgressFormatter) formatStatus(id, format string, a ...interface{}) []byte {
	return FormatStatus(id, format, a...)
}
// formatProgress formats the progress information for a specified action.
// aux, when non-nil, is marshaled into the message's Aux field; a marshaling
// failure makes the whole update return nil (i.e. nothing is written).
func (sf *jsonProgressFormatter) formatProgress(id, action string, progress *jsonmessage.JSONProgress, aux interface{}) []byte {
	if progress == nil {
		progress = &jsonmessage.JSONProgress{}
	}
	var auxJSON *json.RawMessage
	if aux != nil {
		auxJSONBytes, err := json.Marshal(aux)
		if err != nil {
			return nil
		}
		auxJSON = new(json.RawMessage)
		*auxJSON = auxJSONBytes
	}
	b, err := json.Marshal(&jsonmessage.JSONMessage{
		Status:          action,
		ProgressMessage: progress.String(),
		Progress:        progress,
		ID:              id,
		Aux:             auxJSON,
	})
	if err != nil {
		return nil
	}
	return appendNewline(b)
}
// rawProgressFormatter renders progress as plain terminal text rather than JSON.
type rawProgressFormatter struct{}

// formatStatus formats a status line terminated by the stream separator.
func (sf *rawProgressFormatter) formatStatus(id, format string, a ...interface{}) []byte {
	return []byte(fmt.Sprintf(format, a...) + streamNewline)
}

// formatProgress renders "action <bar>\r", overwriting the current terminal
// line in place; an empty bar additionally emits "\n" to finish the line.
func (sf *rawProgressFormatter) formatProgress(id, action string, progress *jsonmessage.JSONProgress, aux interface{}) []byte {
	if progress == nil {
		progress = &jsonmessage.JSONProgress{}
	}
	endl := "\r"
	if progress.String() == "" {
		endl += "\n"
	}
	return []byte(action + " " + progress.String() + endl)
}
// NewProgressOutput returns a progress.Output object that can be passed to
// progress.NewProgressReader. Output is plain text with trailing newlines
// after final updates.
func NewProgressOutput(out io.Writer) progress.Output {
	return &progressOutput{sf: &rawProgressFormatter{}, out: out, newLines: true}
}

// NewJSONProgressOutput returns a progress.Output that formats output
// using JSON objects
func NewJSONProgressOutput(out io.Writer, newLines bool) progress.Output {
	return &progressOutput{sf: &jsonProgressFormatter{}, out: out, newLines: newLines}
}
// formatProgress abstracts over the raw and JSON progress formatters.
type formatProgress interface {
	formatStatus(id, format string, a ...interface{}) []byte
	formatProgress(id, action string, progress *jsonmessage.JSONProgress, aux interface{}) []byte
}

// progressOutput writes formatted progress updates to an io.Writer.
type progressOutput struct {
	sf       formatProgress
	out      io.Writer
	newLines bool       // emit a blank status line after the final update
	mu       sync.Mutex // serializes writes to out
}
// WriteProgress formats progress information from a ProgressReader.
// A non-empty Message is written as a status line; otherwise the Action and
// counters are rendered as a progress bar. Writes are serialized by out.mu.
func (out *progressOutput) WriteProgress(prog progress.Progress) error {
	var formatted []byte
	if prog.Message != "" {
		formatted = out.sf.formatStatus(prog.ID, prog.Message)
	} else {
		jsonProgress := jsonmessage.JSONProgress{Current: prog.Current, Total: prog.Total, HideCounts: prog.HideCounts, Units: prog.Units}
		formatted = out.sf.formatProgress(prog.ID, prog.Action, &jsonProgress, prog.Aux)
	}
	out.mu.Lock()
	defer out.mu.Unlock()
	_, err := out.out.Write(formatted)
	if err != nil {
		return err
	}

	if out.newLines && prog.LastUpdate {
		// Terminate the progress display with an empty status line.
		_, err = out.out.Write(out.sf.formatStatus("", ""))
		return err
	}

	return nil
}
// AuxFormatter is a streamFormatter that writes aux progress messages
type AuxFormatter struct {
	io.Writer
}

// Emit emits the given interface as an aux progress message
// (a newline-terminated JSONMessage whose Aux field holds the marshaled value).
func (sf *AuxFormatter) Emit(id string, aux interface{}) error {
	auxJSONBytes, err := json.Marshal(aux)
	if err != nil {
		return err
	}
	auxJSON := new(json.RawMessage)
	*auxJSON = auxJSONBytes
	msgJSON, err := json.Marshal(&jsonmessage.JSONMessage{ID: id, Aux: auxJSON})
	if err != nil {
		return err
	}
	msgJSON = appendNewline(msgJSON)
	// Treat a partial write as an error even if Write reported none.
	n, err := sf.Writer.Write(msgJSON)
	if n != len(msgJSON) {
		return io.ErrShortWrite
	}
	return err
}
package streamformatter
import (
"encoding/json"
"io"
"github.com/docker/docker/pkg/jsonmessage"
)
// streamWriter wraps a writer, encoding each Write as a JSONMessage "stream"
// line after passing the payload through lineFormat.
type streamWriter struct {
	io.Writer
	lineFormat func([]byte) string
}

// Write encodes buf and forwards it to the underlying writer. It reports
// len(buf) (not the encoded length) on success, per the io.Writer contract.
func (sw *streamWriter) Write(buf []byte) (int, error) {
	formattedBuf := sw.format(buf)
	n, err := sw.Writer.Write(formattedBuf)
	if n != len(formattedBuf) {
		return n, io.ErrShortWrite
	}
	return len(buf), err
}
// format wraps buf in a JSONMessage with its Stream field set, terminated by
// the stream separator. Marshaling failures fall back to FormatError output.
func (sw *streamWriter) format(buf []byte) []byte {
	msg := &jsonmessage.JSONMessage{Stream: sw.lineFormat(buf)}
	b, err := json.Marshal(msg)
	if err != nil {
		return FormatError(err)
	}
	return appendNewline(b)
}
// NewStdoutWriter returns a writer which formats the output as json message
// representing stdout lines
func NewStdoutWriter(out io.Writer) io.Writer {
	return &streamWriter{Writer: out, lineFormat: func(buf []byte) string {
		return string(buf)
	}}
}

// NewStderrWriter returns a writer which formats the output as json message
// representing stderr lines (wrapped in ANSI bright-red escape codes).
func NewStderrWriter(out io.Writer) io.Writer {
	return &streamWriter{Writer: out, lineFormat: func(buf []byte) string {
		return "\033[91m" + string(buf) + "\033[0m"
	}}
}
// Package stringid provides helper functions for dealing with string identifiers
package stringid
import (
"crypto/rand"
"encoding/hex"
"strings"
)
const (
	shortLen = 12 // length of a truncated ("short") ID, in hex characters
	fullLen  = 64 // length of a full ID, in hex characters
)
// TruncateID returns a shorthand version of a string identifier for convenience.
// A collision with other shorthands is very unlikely, but possible.
// In case of a collision a lookup with TruncIndex.Get() will fail, and the caller
// will need to use a longer prefix, or the full-length Id.
func TruncateID(id string) string {
	// Strip an algorithm prefix such as "sha256:".
	if i := strings.IndexByte(id, ':'); i >= 0 {
		id = id[i+1:]
	}
	if len(id) <= shortLen {
		return id
	}
	return id[:shortLen]
}
// GenerateRandomID returns a unique, 64-character ID consisting of a-z, 0-9.
// It guarantees that the ID, when truncated ([TruncateID]) does not consist
// of numbers only, so that the truncated ID can be used as hostname for
// containers.
func GenerateRandomID() string {
	// 32 random bytes hex-encode to exactly fullLen (64) characters.
	b := make([]byte, 32)
	for {
		if _, err := rand.Read(b); err != nil {
			panic(err) // This shouldn't happen
		}
		id := hex.EncodeToString(b)

		// make sure that the truncated ID does not consist of only numeric
		// characters, as it's used as default hostname for containers.
		//
		// See:
		// - https://github.com/moby/moby/issues/3869
		// - https://bugzilla.redhat.com/show_bug.cgi?id=1059122
		if allNum(id[:shortLen]) {
			// all numbers; try again
			continue
		}
		return id
	}
}
// allNum checks whether id consists of only numbers (0-9).
// The empty string is vacuously all-numeric.
func allNum(id string) bool {
	for i := 0; i < len(id); i++ {
		if id[i] < '0' || id[i] > '9' {
			return false
		}
	}
	return true
}
package system
import (
"os"
"syscall"
"time"
"unsafe"
)
// Used by Chtimes
var unixEpochTime, unixMaxTime time.Time

// init computes the valid time range for os.Chtimes based on the width of
// the platform's syscall.Timespec nanosecond field.
func init() {
	unixEpochTime = time.Unix(0, 0)
	if unsafe.Sizeof(syscall.Timespec{}.Nsec) == 8 {
		// This is a 64 bit timespec
		// os.Chtimes limits time to the following
		//
		// Note that this intentionally sets nsec (not sec), which sets both sec
		// and nsec internally in time.Unix();
		// https://github.com/golang/go/blob/go1.19.2/src/time/time.go#L1364-L1380
		unixMaxTime = time.Unix(0, 1<<63-1)
	} else {
		// This is a 32 bit timespec
		unixMaxTime = time.Unix(1<<31-1, 0)
	}
}
// Chtimes changes the access time and modified time of a file at the given path.
// If a time is prior to the Unix Epoch (unixEpochTime), or after the
// end of Unix Time (unixMaxTime), os.Chtimes has undefined behavior. In this
// case, Chtimes defaults to Unix Epoch, just in case.
func Chtimes(name string, atime time.Time, mtime time.Time) error {
	// Clamp out-of-range timestamps to the Unix Epoch.
	if atime.Before(unixEpochTime) || atime.After(unixMaxTime) {
		atime = unixEpochTime
	}

	if mtime.Before(unixEpochTime) || mtime.After(unixMaxTime) {
		mtime = unixEpochTime
	}

	if err := os.Chtimes(name, atime, mtime); err != nil {
		return err
	}

	// Take platform specific action for setting create time.
	return setCTime(name, mtime)
}
//go:build !windows
package system
import (
"time"
)
// setCTime will set the create time on a file. On Unix, the create
// time is updated as a side effect of setting the modified time, so
// no action is required.
func setCTime(path string, ctime time.Time) error {
	return nil
}
package system
import (
"os"
"path/filepath"
"strings"
)
// IsAbs is a platform-agnostic wrapper for filepath.IsAbs.
//
// On Windows, golang filepath.IsAbs does not consider a path \windows\system32
// as absolute as it doesn't start with a drive-letter/colon combination. However,
// in docker we need to verify things such as WORKDIR /windows/system32 in
// a Dockerfile (which gets translated to \windows\system32 when being processed
// by the daemon). This SHOULD be treated as absolute from a docker processing
// perspective.
//
// Deprecated: this function was only used internally and will be removed in the next release.
func IsAbs(path string) bool {
return filepath.IsAbs(path) || strings.HasPrefix(path, string(os.PathSeparator))
}
//go:build !windows
package system
import "os"
// MkdirAllWithACL is a wrapper for os.MkdirAll on unix systems.
// The trailing string argument is an ACL accepted only for API compatibility
// with the Windows implementation; it is ignored here.
func MkdirAllWithACL(path string, perm os.FileMode, _ string) error {
	return os.MkdirAll(path, perm)
}
//go:build linux || freebsd
package system
import (
"errors"
"syscall"
"golang.org/x/sys/unix"
)
// LUtimesNano is used to change access and modification time of the specified path.
// It's used for symbol link file because unix.UtimesNano doesn't support a NOFOLLOW flag atm.
func LUtimesNano(path string, ts []syscall.Timespec) error {
	times := []unix.Timespec{
		unix.NsecToTimespec(syscall.TimespecToNsec(ts[0])),
		unix.NsecToTimespec(syscall.TimespecToNsec(ts[1])),
	}
	// ENOSYS is deliberately swallowed: kernels without utimensat support
	// are treated as a best-effort no-op.
	if err := unix.UtimesNanoAt(unix.AT_FDCWD, path, times, unix.AT_SYMLINK_NOFOLLOW); err != nil && !errors.Is(err, unix.ENOSYS) {
		return err
	}
	return nil
}
package system
type XattrError struct {
Op string
Attr string
Path string
Err error
}
func (e *XattrError) Error() string { return e.Op + " " + e.Attr + " " + e.Path + ": " + e.Err.Error() }
func (e *XattrError) Unwrap() error { return e.Err }
// Timeout reports whether this error represents a timeout.
func (e *XattrError) Timeout() bool {
t, ok := e.Err.(interface{ Timeout() bool })
return ok && t.Timeout()
}
package system
import (
"errors"
"golang.org/x/sys/unix"
)
// Lgetxattr retrieves the value of the extended attribute identified by attr
// and associated with the given path in the file system.
// It returns a nil slice and nil error if the xattr is not set.
func Lgetxattr(path string, attr string) ([]byte, error) {
	// sysErr wraps a raw errno in an XattrError carrying the op/attr/path.
	sysErr := func(err error) ([]byte, error) {
		return nil, &XattrError{Op: "lgetxattr", Attr: attr, Path: path, Err: err}
	}

	// Start with a 128 length byte array
	dest := make([]byte, 128)
	sz, errno := unix.Lgetxattr(path, attr, dest)

	// ERANGE means the buffer was too small. Loop (rather than retry once)
	// because the attribute can grow between the size probe and the read.
	for errors.Is(errno, unix.ERANGE) {
		// Buffer too small, use zero-sized buffer to get the actual size
		sz, errno = unix.Lgetxattr(path, attr, []byte{})
		if errno != nil {
			return sysErr(errno)
		}
		dest = make([]byte, sz)
		sz, errno = unix.Lgetxattr(path, attr, dest)
	}

	switch {
	case errors.Is(errno, unix.ENODATA):
		// Attribute not set: by contract, nil slice and nil error.
		return nil, nil
	case errno != nil:
		return sysErr(errno)
	}

	return dest[:sz], nil
}
// Lsetxattr sets the value of the extended attribute identified by attr
// and associated with the given path in the file system.
func Lsetxattr(path string, attr string, data []byte, flags int) error {
	if err := unix.Lsetxattr(path, attr, data, flags); err != nil {
		return &XattrError{Op: "lsetxattr", Attr: attr, Path: path, Err: err}
	}
	return nil
}
// Package tailfile provides helper functions to read the nth lines of any
// ReadSeeker.
package tailfile
import (
"bufio"
"bytes"
"context"
"errors"
"io"
"os"
)
// blockSize is the chunk size used when scanning backwards through the input.
const blockSize = 1024

// eol is the default line delimiter.
var eol = []byte("\n")

// ErrNonPositiveLinesNumber is an error returned if the requested number of
// lines was not positive (zero or negative).
var ErrNonPositiveLinesNumber = errors.New("The number of lines to extract from the file must be positive")
// TailFile returns last n lines of the passed in file.
func TailFile(f *os.File, n int) ([][]byte, error) {
	size, err := f.Seek(0, io.SeekEnd)
	if err != nil {
		return nil, err
	}

	rAt := io.NewSectionReader(f, 0, size)
	r, nLines, err := NewTailReader(context.Background(), rAt, n)
	if err != nil {
		return nil, err
	}

	buf := make([][]byte, 0, nLines)
	s := bufio.NewScanner(r)
	for s.Scan() {
		// s.Bytes() aliases the scanner's internal buffer, which may be
		// overwritten by a subsequent Scan call; copy each line out.
		line := make([]byte, len(s.Bytes()))
		copy(line, s.Bytes())
		buf = append(buf, line)
	}
	// Surface scanner read errors instead of silently returning partial
	// output.
	if err := s.Err(); err != nil {
		return nil, err
	}
	return buf, nil
}
// SizeReaderAt is an interface used to get a ReaderAt as well as the size of the underlying reader.
// Note that the size of the underlying reader should not change when using this interface.
type SizeReaderAt interface {
	io.ReaderAt
	// Size returns the total size of the underlying reader.
	Size() int64
}
// NewTailReader scopes the passed in reader to just the last N lines passed in,
// using "\n" as the line delimiter.
func NewTailReader(ctx context.Context, r SizeReaderAt, reqLines int) (*io.SectionReader, int, error) {
	return NewTailReaderWithDelimiter(ctx, r, reqLines, eol)
}
// NewTailReaderWithDelimiter scopes the passed in reader to just the last N lines passed in
// In this case a "line" is defined by the passed in delimiter.
//
// Delimiter lengths should be generally small, no more than 12 bytes
func NewTailReaderWithDelimiter(ctx context.Context, r SizeReaderAt, reqLines int, delimiter []byte) (*io.SectionReader, int, error) {
	if reqLines < 1 {
		return nil, 0, ErrNonPositiveLinesNumber
	}
	if len(delimiter) == 0 {
		return nil, 0, errors.New("must provide a delimiter")
	}
	var (
		size      = r.Size()
		tailStart int64
		tailEnd   = size
		found     int
	)

	if int64(len(delimiter)) >= size {
		return io.NewSectionReader(bytes.NewReader(nil), 0, 0), 0, nil
	}

	s := newScanner(r, delimiter)
	for s.Scan(ctx) {
		found++
		if found == 1 {
			tailEnd = s.End()
		}
		if found == reqLines {
			break
		}
	}
	// Scan returns false on error, so the error must be checked after the
	// loop; the previous in-loop check could never fire and read errors
	// were silently dropped.
	if err := s.Err(); err != nil {
		return nil, 0, err
	}

	tailStart = s.Start(ctx)

	if found == 0 {
		return io.NewSectionReader(bytes.NewReader(nil), 0, 0), 0, nil
	}

	// Fewer lines than requested: include everything from the start.
	if found < reqLines && tailStart != 0 {
		tailStart = 0
	}
	return io.NewSectionReader(r, tailStart, tailEnd-tailStart), found, nil
}
// newScanner returns a backwards scanner over r that splits on delim.
// The read window is capped at blockSize (or the reader size, if smaller)
// and is widened when the delimiter would not fit comfortably.
func newScanner(r SizeReaderAt, delim []byte) *scanner {
	size := r.Size()
	readSize := blockSize
	if readSize > int(size) {
		readSize = int(size)
	}
	// silly case... delimiter nearly as large as the window: widen the
	// window so it can always hold two delimiters plus some payload.
	if len(delim) >= readSize/2 {
		readSize = len(delim)*2 + 2
	}

	return &scanner{
		r:     r,
		pos:   size,
		buf:   make([]byte, readSize),
		delim: delim,
	}
}
// scanner walks a SizeReaderAt backwards, token by token, splitting on a
// delimiter.
type scanner struct {
	r     SizeReaderAt // source being scanned
	pos   int64        // absolute offset in r of the start of buf
	buf   []byte       // current read window
	delim []byte       // token delimiter
	err   error        // sticky error, reported by Err
	idx   int          // index within buf of the most recently found delimiter
}
// Start returns the absolute offset of the beginning of the current token.
// If the preceding delimiter is not in the buffer, it scans further back
// using a copy of the scanner state; 0 is returned when no preceding
// delimiter is found.
func (s *scanner) Start(ctx context.Context) int64 {
	if s.idx > 0 {
		idx := bytes.LastIndex(s.buf[:s.idx], s.delim)
		if idx >= 0 {
			return s.pos + int64(idx) + int64(len(s.delim))
		}
	}

	// slow path: clone the scanner state so the look-behind does not
	// disturb the main scan position.
	buf := make([]byte, len(s.buf))
	copy(buf, s.buf)

	readAhead := &scanner{
		r:     s.r,
		pos:   s.pos,
		delim: s.delim,
		idx:   s.idx,
		buf:   buf,
	}

	if !readAhead.Scan(ctx) {
		return 0
	}
	return readAhead.End()
}
// End returns the absolute offset just past the most recently found
// delimiter, i.e. the end of the current token's span.
func (s *scanner) End() int64 {
	return s.pos + int64(s.idx) + int64(len(s.delim))
}

// Err returns the first error encountered while scanning, if any.
func (s *scanner) Err() error {
	return s.err
}
// Scan advances the scanner backwards to the previous delimiter. It returns
// false at the start of input, on read error, or on context cancellation
// (check Err to distinguish).
func (s *scanner) Scan(ctx context.Context) bool {
	if s.err != nil {
		return false
	}

	for {
		// Bail out promptly if the caller cancelled.
		select {
		case <-ctx.Done():
			s.err = ctx.Err()
			return false
		default:
		}

		idx := s.idx - len(s.delim)
		if idx < 0 {
			// Buffer exhausted: read the preceding chunk from the source.
			readSize := int(s.pos)
			if readSize > len(s.buf) {
				readSize = len(s.buf)
			}

			if readSize < len(s.delim) {
				return false
			}

			offset := s.pos - int64(readSize)
			n, err := s.r.ReadAt(s.buf[:readSize], offset)
			if err != nil && !errors.Is(err, io.EOF) {
				s.err = err
				return false
			}

			s.pos -= int64(n)
			idx = n
		}

		s.idx = bytes.LastIndex(s.buf[:idx], s.delim)
		if s.idx >= 0 {
			return true
		}

		if len(s.delim) > 1 && s.pos > 0 {
			// in this case, there may be a partial delimiter at the front of the buffer, so set the position forward
			// up to the maximum size partial that could be there so it can be read again in the next iteration with any
			// potential remainder.
			// An example where delimiter is `####`:
			//   [##asdfqwerty]
			//    ^
			// This resets the position to where the arrow is pointing.
			// It could actually check if a partial exists and at the front, but that is pretty similar to the indexing
			// code above though a bit more complex since each byte has to be checked (`len(delimiter)-1`) factorial).
			// It's much simpler and cleaner to just re-read `len(delimiter)-1` bytes again.
			s.pos += int64(len(s.delim)) - 1
		}

	}
}
package tarsum
// BuilderContext is an interface extending TarSum by adding the Remove method.
// In general there was concern about adding this method to TarSum itself
// so instead it is being added just to "BuilderContext" which will then
// only be used during the .dockerignore file processing
// - see builder/evaluator.go
type BuilderContext interface {
	TarSum
	// Remove drops all checksum entries recorded for the given file name.
	Remove(string)
}
// Remove deletes every recorded FileInfoSum whose name matches filename.
// There can be more than one entry with the same name, so all are removed.
func (ts *tarSum) Remove(filename string) {
	// Filter in place. The previous implementation deleted by index while
	// ranging over the same slice, which skips the element immediately
	// following each removal and therefore missed adjacent duplicates.
	kept := ts.sums[:0]
	for _, fis := range ts.sums {
		if fis.Name() != filename {
			kept = append(kept, fis)
		}
	}
	ts.sums = kept
}
package tarsum
import (
"runtime"
"sort"
"strings"
)
// FileInfoSumInterface provides an interface for accessing file checksum
// information within a tar file. This info is accessed through interface
// so the actual name and sum cannot be melded with.
type FileInfoSumInterface interface {
	// Name returns the file name.
	Name() string
	// Sum returns the checksum of this particular file and its headers.
	Sum() string
	// Pos returns the position of the file in the tar.
	Pos() int64
}
// fileInfoSum is the concrete FileInfoSumInterface implementation recorded
// for each file while reading a tar archive.
type fileInfoSum struct {
	name string
	sum  string
	pos  int64
}

// Name returns the file name.
func (fis fileInfoSum) Name() string {
	return fis.name
}

// Sum returns the file's checksum.
func (fis fileInfoSum) Sum() string {
	return fis.sum
}

// Pos returns the file's position in the tar.
func (fis fileInfoSum) Pos() int64 {
	return fis.pos
}
// FileInfoSums provides a list of FileInfoSumInterfaces.
type FileInfoSums []FileInfoSumInterface

// GetFile returns the first FileInfoSumInterface with a matching name.
func (fis FileInfoSums) GetFile(name string) FileInfoSumInterface {
	// We do case insensitive matching on Windows as c:\APP and c:\app are
	// the same. See issue #33107.
	caseInsensitive := runtime.GOOS == "windows"
	for _, f := range fis {
		if caseInsensitive {
			if strings.EqualFold(f.Name(), name) {
				return f
			}
			continue
		}
		if f.Name() == name {
			return f
		}
	}
	return nil
}
// GetAllFile returns a FileInfoSums with all matching names.
func (fis FileInfoSums) GetAllFile(name string) FileInfoSums {
	// Always return a non-nil (possibly empty) slice, matching the
	// historical behavior.
	matches := FileInfoSums{}
	for _, f := range fis {
		if f.Name() == name {
			matches = append(matches, f)
		}
	}
	return matches
}
// GetDuplicatePaths returns a FileInfoSums with all duplicated paths.
// The first occurrence of each name is not included; every later
// occurrence is.
func (fis FileInfoSums) GetDuplicatePaths() (dups FileInfoSums) {
	seen := make(map[string]struct{}, len(fis)) // allocate early; no need to grow this map.
	for _, f := range fis {
		if _, ok := seen[f.Name()]; ok {
			dups = append(dups, f)
			continue
		}
		seen[f.Name()] = struct{}{}
	}
	return dups
}
// Len returns the size of the FileInfoSums.
func (fis FileInfoSums) Len() int { return len(fis) }

// Swap swaps two FileInfoSum values in a FileInfoSums list.
func (fis FileInfoSums) Swap(i, j int) { fis[i], fis[j] = fis[j], fis[i] }

// SortByPos sorts FileInfoSums content by position.
func (fis FileInfoSums) SortByPos() {
	sort.Sort(byPos{fis})
}

// SortByNames sorts FileInfoSums content by name.
func (fis FileInfoSums) SortByNames() {
	sort.Sort(byName{fis})
}
// SortBySums sorts FileInfoSums content by sums; entries sharing a name
// (duplicates) fall back to archive position.
func (fis FileInfoSums) SortBySums() {
	// GetDuplicatePaths returns a nil slice when there are no duplicates,
	// so the previous length check with a separate nil branch was
	// redundant — both branches produced the same bySum value.
	sort.Sort(bySum{fis, fis.GetDuplicatePaths()})
}
// byName is a sort.Sort helper for sorting by file names.
// If names are the same, order them by their appearance in the tar archive
type byName struct{ FileInfoSums }

func (bn byName) Less(i, j int) bool {
	ni, nj := bn.FileInfoSums[i].Name(), bn.FileInfoSums[j].Name()
	if ni == nj {
		// Tie-break equal names by original archive position.
		return bn.FileInfoSums[i].Pos() < bn.FileInfoSums[j].Pos()
	}
	return ni < nj
}
// bySum is a sort.Sort helper for sorting by the sums of all the fileinfos in the tar archive.
// When dups is non-nil, entries sharing a name are ordered by archive
// position instead of by sum, keeping duplicates in a stable order.
type bySum struct {
	FileInfoSums
	dups FileInfoSums
}

func (bs bySum) Less(i, j int) bool {
	if bs.dups != nil && bs.FileInfoSums[i].Name() == bs.FileInfoSums[j].Name() {
		return bs.FileInfoSums[i].Pos() < bs.FileInfoSums[j].Pos()
	}
	return bs.FileInfoSums[i].Sum() < bs.FileInfoSums[j].Sum()
}
// byPos is a sort.Sort helper for sorting the fileinfos by their original
// order of appearance in the tar archive.
type byPos struct{ FileInfoSums }

func (bp byPos) Less(i, j int) bool {
	return bp.FileInfoSums[i].Pos() < bp.FileInfoSums[j].Pos()
}
// Package tarsum provides algorithms to perform checksum calculation on
// filesystem layers.
//
// The transportation of filesystems, regarding Docker, is done with tar(1)
// archives. There are a variety of tar serialization formats [2], and a key
// concern here is ensuring a repeatable checksum given a set of inputs from a
// generic tar archive. Types of transportation include distribution to and from a
// registry endpoint, saving and loading through commands or Docker daemon APIs,
// transferring the build context from client to Docker daemon, and committing the
// filesystem of a container to become an image.
//
// As tar archives are used for transit, but not preserved in many situations, the
// focus of the algorithm is to ensure the integrity of the preserved filesystem,
// while maintaining a deterministic accountability. This includes neither
// constraining the ordering or manipulation of the files during the creation or
// unpacking of the archive, nor including additional metadata state about the file
// system attributes.
package tarsum
import (
"archive/tar"
"bytes"
"compress/gzip"
"crypto"
"crypto/sha256"
"encoding/hex"
"errors"
"fmt"
"hash"
"io"
"path"
"strings"
)
// Scratch-buffer size buckets used by tarSum.Read when sizing its internal
// read buffer to the caller's read size.
const (
	buf8K  = 8 * 1024
	buf16K = 16 * 1024
	buf32K = 32 * 1024
)
// NewTarSum creates a new interface for calculating a fixed time checksum of a
// tar archive.
//
// This is used for calculating checksums of layers of an image, in some cases
// including the byte payload of the image's json metadata as well, and for
// calculating the checksums for buildcache.
//
// dc disables gzip compression of the pass-through output stream.
func NewTarSum(r io.Reader, dc bool, v Version) (TarSum, error) {
	return NewTarSumHash(r, dc, v, DefaultTHash)
}
// NewTarSumHash creates a new TarSum, providing a THash to use rather than
// the DefaultTHash.
func NewTarSumHash(r io.Reader, dc bool, v Version, tHash THash) (TarSum, error) {
	selector, err := getTarHeaderSelector(v)
	if err != nil {
		return nil, err
	}
	ts := &tarSum{
		Reader:             r,
		DisableCompression: dc,
		tarSumVersion:      v,
		headerSelector:     selector,
		tHash:              tHash,
	}
	// Note: ts is returned even when initTarSum errors, preserving the
	// historical contract.
	err = ts.initTarSum()
	return ts, err
}
// NewTarSumForLabel creates a new TarSum using the provided TarSum version+hash label.
func NewTarSumForLabel(r io.Reader, disableCompression bool, label string) (TarSum, error) {
	versionName, hashName, ok := strings.Cut(label, "+")
	if !ok {
		return nil, errors.New("tarsum label string should be of the form: {tarsum_version}+{hash_name}")
	}

	version, found := tarSumVersionsByName[versionName]
	if !found {
		return nil, fmt.Errorf("unknown TarSum version name: %q", versionName)
	}

	hashConfig, found := standardHashConfigs[hashName]
	if !found {
		return nil, fmt.Errorf("unknown TarSum hash name: %q", hashName)
	}

	return NewTarSumHash(r, disableCompression, version, NewTHash(hashConfig.name, hashConfig.hash.New))
}
// TarSum is the generic interface for calculating fixed time
// checksums of a tar archive.
type TarSum interface {
	io.Reader
	// GetSums returns the per-file checksums recorded so far.
	GetSums() FileInfoSums
	// Sum returns the overall checksum string, mixing in the extra bytes
	// when non-nil.
	Sum([]byte) string
	// Version returns the TarSum version in use.
	Version() Version
	// Hash returns the THash in use.
	Hash() THash
}
// tarSum struct is the structure for a Version0 checksum calculation.
type tarSum struct {
	io.Reader
	tarR        *tar.Reader       // reads the caller's archive entry by entry
	tarW        *tar.Writer       // re-serializes entries into bufTar
	writer      writeCloseFlusher // gzip writer, or pass-through when compression is disabled
	bufTar      *bytes.Buffer     // staging buffer between tarW and writer
	bufWriter   *bytes.Buffer     // final output buffer served to Read callers
	bufData     []byte            // reusable scratch buffer sized to the caller's reads
	h           hash.Hash         // running hash of the current file
	tHash       THash             // hash constructor used for per-file and total sums
	sums        FileInfoSums      // completed per-file checksums
	fileCounter int64             // position assigned to the next completed file
	currentFile string            // cleaned name of the entry currently being hashed
	finished    bool              // true once the underlying archive is exhausted
	first       bool              // true until the first entry boundary is seen

	DisableCompression bool              // false by default. When false, the output gzip compressed.
	tarSumVersion      Version           // this field is not exported so it can not be mutated during use
	headerSelector     tarHeaderSelector // handles selecting and ordering headers for files in the archive
}
// Hash returns the THash used for checksum calculation.
func (ts tarSum) Hash() THash {
	return ts.tHash
}

// Version returns the TarSum version this instance calculates.
func (ts tarSum) Version() Version {
	return ts.tarSumVersion
}
// THash provides a hash.Hash type generator and its name.
type THash interface {
	// Hash returns a new hash.Hash instance.
	Hash() hash.Hash
	// Name returns the algorithm name (e.g. "sha256").
	Name() string
}

// NewTHash is a convenience method for creating a THash.
func NewTHash(name string, h func() hash.Hash) THash {
	return simpleTHash{n: name, h: h}
}
// tHashConfig pairs a hash algorithm name with its crypto.Hash identifier.
type tHashConfig struct {
	name string
	hash crypto.Hash
}

// standardHashConfigs maps supported hash-name labels to their configs.
// NOTE: DO NOT include MD5 or SHA1, which are considered insecure.
var standardHashConfigs = map[string]tHashConfig{
	"sha256": {name: "sha256", hash: crypto.SHA256},
	"sha512": {name: "sha512", hash: crypto.SHA512},
}

// DefaultTHash is default TarSum hashing algorithm - "sha256".
var DefaultTHash = NewTHash("sha256", sha256.New)
// simpleTHash is the trivial THash implementation backed by a name and a
// hash constructor function.
type simpleTHash struct {
	n string
	h func() hash.Hash
}

func (sth simpleTHash) Name() string    { return sth.n }
func (sth simpleTHash) Hash() hash.Hash { return sth.h() }
// encodeHeader feeds the selected header fields of h into the running
// per-file hash.
func (ts *tarSum) encodeHeader(h *tar.Header) error {
	for _, kv := range ts.headerSelector.selectHeaders(h) {
		key, value := kv[0], kv[1]
		// Ignore these headers to be compatible with versions
		// before go 1.10
		if key == "gname" || key == "uname" {
			value = ""
		}
		if _, err := ts.h.Write([]byte(key + value)); err != nil {
			return err
		}
	}
	return nil
}
// initTarSum wires up the internal buffers, tar reader/writer, output
// writer, and hash state. It must be called before the first Read.
func (ts *tarSum) initTarSum() error {
	ts.bufTar = bytes.NewBuffer([]byte{})
	ts.bufWriter = bytes.NewBuffer([]byte{})
	ts.tarR = tar.NewReader(ts.Reader)
	ts.tarW = tar.NewWriter(ts.bufTar)
	if ts.DisableCompression {
		ts.writer = &nopCloseFlusher{Writer: ts.bufWriter}
	} else {
		ts.writer = gzip.NewWriter(ts.bufWriter)
	}
	if ts.tHash == nil {
		ts.tHash = DefaultTHash
	}
	ts.h = ts.tHash.Hash()
	ts.h.Reset()
	ts.first = true
	ts.sums = FileInfoSums{}
	return nil
}
// Read implements io.Reader. It streams the tar archive through the running
// hash and the (optionally gzip-compressed) output buffer, finalizing a
// per-file checksum each time a tar entry boundary is reached. Once the
// archive is exhausted, remaining bytes are served from the output buffer.
func (ts *tarSum) Read(buf []byte) (int, error) {
	if ts.finished {
		return ts.bufWriter.Read(buf)
	}
	// Size the scratch buffer to the caller's read size, bucketed to
	// limit reallocation.
	if len(ts.bufData) < len(buf) {
		switch {
		case len(buf) <= buf8K:
			ts.bufData = make([]byte, buf8K)
		case len(buf) <= buf16K:
			ts.bufData = make([]byte, buf16K)
		case len(buf) <= buf32K:
			ts.bufData = make([]byte, buf32K)
		default:
			ts.bufData = make([]byte, len(buf))
		}
	}
	buf2 := ts.bufData[:len(buf)]

	n, err := ts.tarR.Read(buf2)
	if err != nil {
		if errors.Is(err, io.EOF) {
			// End of the current entry: flush its tail into the hash and
			// record the finished per-file checksum.
			if _, err := ts.h.Write(buf2[:n]); err != nil {
				return 0, err
			}
			if !ts.first {
				ts.sums = append(ts.sums, fileInfoSum{name: ts.currentFile, sum: hex.EncodeToString(ts.h.Sum(nil)), pos: ts.fileCounter})
				ts.fileCounter++
				ts.h.Reset()
			} else {
				ts.first = false
			}

			if _, err := ts.tarW.Write(buf2[:n]); err != nil {
				return 0, err
			}

			currentHeader, err := ts.tarR.Next()
			if err != nil {
				// Use errors.Is here for consistency with the EOF check
				// above; a bare == comparison would miss wrapped errors.
				if errors.Is(err, io.EOF) {
					// Archive complete: close the writers, flush all
					// buffered output, and switch to drain mode.
					if err := ts.tarW.Close(); err != nil {
						return 0, err
					}
					if _, err := io.Copy(ts.writer, ts.bufTar); err != nil {
						return 0, err
					}
					if err := ts.writer.Close(); err != nil {
						return 0, err
					}
					ts.finished = true
					return ts.bufWriter.Read(buf)
				}
				return 0, err
			}

			// #nosec G305 -- The joined path is not passed to any filesystem APIs.
			ts.currentFile = path.Join(".", path.Join("/", currentHeader.Name))
			if err := ts.encodeHeader(currentHeader); err != nil {
				return 0, err
			}
			if err := ts.tarW.WriteHeader(currentHeader); err != nil {
				return 0, err
			}
			if _, err := io.Copy(ts.writer, ts.bufTar); err != nil {
				return 0, err
			}
			ts.writer.Flush()

			return ts.bufWriter.Read(buf)
		}
		return 0, err
	}

	// Filling the hash buffer
	if _, err = ts.h.Write(buf2[:n]); err != nil {
		return 0, err
	}

	// Filling the tar writer
	if _, err = ts.tarW.Write(buf2[:n]); err != nil {
		return 0, err
	}

	// Filling the output writer
	if _, err = io.Copy(ts.writer, ts.bufTar); err != nil {
		return 0, err
	}
	ts.writer.Flush()

	return ts.bufWriter.Read(buf)
}
// Sum returns the canonical checksum string for the archive:
// "{version}+{hashName}:{hex}". Per-file sums are sorted first; extra
// bytes, when non-nil, are mixed in ahead of them.
func (ts *tarSum) Sum(extra []byte) string {
	ts.sums.SortBySums()
	hasher := ts.tHash.Hash()
	if extra != nil {
		hasher.Write(extra)
	}
	for _, fis := range ts.sums {
		hasher.Write([]byte(fis.Sum()))
	}
	return ts.Version().String() + "+" + ts.tHash.Name() + ":" + hex.EncodeToString(hasher.Sum(nil))
}
// GetSums returns the per-file checksums recorded so far.
func (ts *tarSum) GetSums() FileInfoSums {
	return ts.sums
}
package tarsum
import (
"archive/tar"
"errors"
"io"
"sort"
"strconv"
"strings"
)
// Version is used for versioning of the TarSum algorithm
// based on the prefix of the hash used
// i.e. "tarsum+sha256:e58fcf7418d4390dec8e8fb69d88c06ec07039d651fedd3aa72af9972e7d046b"
type Version int

// Known versions; their string labels all carry the prefix "tarsum".
const (
	Version0 Version = iota
	Version1
	// VersionDev this constant will be either the latest or an unsettled next-version of the TarSum calculation
	VersionDev
)
// WriteV1Header writes a tar header to a writer in V1 tarsum format.
// NOTE: write errors from w are silently discarded; intended for writers
// that cannot fail, such as a hash.Hash.
func WriteV1Header(h *tar.Header, w io.Writer) {
	for _, elem := range v1TarHeaderSelect(h) {
		w.Write([]byte(elem[0] + elem[1]))
	}
}
// VersionLabelForChecksum returns the label for the given tarsum
// checksum, i.e., everything before the first `+` character in
// the string or an empty string if no label separator is found.
func VersionLabelForChecksum(checksum string) string {
	// Checksums are in the form: {versionLabel}+{hashID}:{hex}.
	// strings.Cut replaces the manual Index/slice dance, matching the
	// parsing style used by NewTarSumForLabel and GetVersionFromTarsum.
	label, _, ok := strings.Cut(checksum, "+")
	if !ok {
		return ""
	}
	return label
}
// GetVersions gets a list of all known tarsum versions.
// The result order is unspecified (map iteration order).
func GetVersions() []Version {
	// Pre-size the slice to avoid repeated growth while appending.
	v := make([]Version, 0, len(tarSumVersions))
	for k := range tarSumVersions {
		v = append(v, k)
	}
	return v
}
var (
	// tarSumVersions maps each Version to its canonical string label.
	tarSumVersions = map[Version]string{
		Version0:   "tarsum",
		Version1:   "tarsum.v1",
		VersionDev: "tarsum.dev",
	}
	// tarSumVersionsByName is the inverse mapping, label to Version.
	tarSumVersionsByName = map[string]Version{
		"tarsum":     Version0,
		"tarsum.v1":  Version1,
		"tarsum.dev": VersionDev,
	}
)
// String returns the label for the version, or the empty string for an
// unknown Version value.
func (tsv Version) String() string {
	return tarSumVersions[tsv]
}
// GetVersionFromTarsum returns the Version from the provided string.
func GetVersionFromTarsum(tarsum string) (Version, error) {
	name, _, _ := strings.Cut(tarsum, "+")
	if v, ok := tarSumVersionsByName[name]; ok {
		return v, nil
	}
	return -1, ErrNotVersion
}
// Errors that may be returned by functions in this package
var (
	// ErrNotVersion is returned when a string does not contain a
	// recognized TarSum version label.
	ErrNotVersion = errors.New("string does not include a TarSum Version")
	// ErrVersionNotImplemented is returned for versions that have no
	// registered header selector.
	ErrVersionNotImplemented = errors.New("TarSum Version is not yet implemented")
)
// tarHeaderSelector is the interface which different versions
// of tarsum should use for selecting and ordering tar headers
// for each item in the archive.
type tarHeaderSelector interface {
	selectHeaders(h *tar.Header) (orderedHeaders [][2]string)
}

// tarHeaderSelectFunc adapts a plain function to the tarHeaderSelector
// interface.
type tarHeaderSelectFunc func(h *tar.Header) (orderedHeaders [][2]string)

func (f tarHeaderSelectFunc) selectHeaders(h *tar.Header) (orderedHeaders [][2]string) {
	return f(h)
}
// v0TarHeaderSelect returns the ordered header fields hashed for Version0:
// the basic tar header fields, including mtime.
func v0TarHeaderSelect(h *tar.Header) (orderedHeaders [][2]string) {
	return [][2]string{
		{"name", h.Name},
		{"mode", strconv.FormatInt(h.Mode, 10)},
		{"uid", strconv.Itoa(h.Uid)},
		{"gid", strconv.Itoa(h.Gid)},
		{"size", strconv.FormatInt(h.Size, 10)},
		{"mtime", strconv.FormatInt(h.ModTime.UTC().Unix(), 10)},
		{"typeflag", string([]byte{h.Typeflag})},
		{"linkname", h.Linkname},
		{"uname", h.Uname},
		{"gname", h.Gname},
		{"devmajor", strconv.FormatInt(h.Devmajor, 10)},
		{"devminor", strconv.FormatInt(h.Devminor, 10)},
	}
}
// v1TarHeaderSelect returns the ordered header fields for Version1 (and
// VersionDev): the v0 fields minus mtime, followed by the entry's extended
// attributes sorted by name.
func v1TarHeaderSelect(h *tar.Header) (orderedHeaders [][2]string) {
	// Get extended attributes.
	const paxSchilyXattr = "SCHILY.xattr."
	var xattrs [][2]string
	for k, v := range h.PAXRecords {
		if xattr, ok := strings.CutPrefix(k, paxSchilyXattr); ok {
			// h.Xattrs keys take precedence over h.PAXRecords keys, like
			// archive/tar does when writing.
			if vv, ok := h.Xattrs[xattr]; ok { //nolint:staticcheck // field deprecated in stdlib
				v = vv
			}
			xattrs = append(xattrs, [2]string{xattr, v})
		}
	}
	// Get extended attributes which are not in PAXRecords.
	for k, v := range h.Xattrs { //nolint:staticcheck // field deprecated in stdlib
		if _, ok := h.PAXRecords[paxSchilyXattr+k]; !ok {
			xattrs = append(xattrs, [2]string{k, v})
		}
	}
	// Sort for a deterministic hashing order.
	sort.Slice(xattrs, func(i, j int) bool { return xattrs[i][0] < xattrs[j][0] })

	// Make the slice with enough capacity to hold the 11 basic headers
	// we want from the v0 selector plus however many xattrs we have.
	orderedHeaders = make([][2]string, 0, 11+len(xattrs))

	// Copy all headers from v0 excluding the 'mtime' header (the 5th element).
	v0headers := v0TarHeaderSelect(h)
	orderedHeaders = append(orderedHeaders, v0headers[0:5]...)
	orderedHeaders = append(orderedHeaders, v0headers[6:]...)

	// Finally, append the sorted xattrs.
	orderedHeaders = append(orderedHeaders, xattrs...)

	return orderedHeaders
}
// registeredHeaderSelectors maps each supported Version to its header
// selector; Version1 and VersionDev share the same selection.
var registeredHeaderSelectors = map[Version]tarHeaderSelectFunc{
	Version0:   v0TarHeaderSelect,
	Version1:   v1TarHeaderSelect,
	VersionDev: v1TarHeaderSelect,
}
// getTarHeaderSelector returns the header selector registered for v, or
// ErrVersionNotImplemented when the version is unknown.
func getTarHeaderSelector(v Version) (tarHeaderSelector, error) {
	if selector, ok := registeredHeaderSelectors[v]; ok {
		return selector, nil
	}
	return nil, ErrVersionNotImplemented
}
package tarsum
import (
"io"
)
// writeCloseFlusher is the output writer contract used by tarSum: either a
// gzip writer or the pass-through nopCloseFlusher.
type writeCloseFlusher interface {
	io.WriteCloser
	Flush() error
}

// nopCloseFlusher wraps an io.Writer with no-op Close and Flush so a plain
// writer satisfies writeCloseFlusher when compression is disabled.
type nopCloseFlusher struct {
	io.Writer
}

// Close is a no-op.
func (n *nopCloseFlusher) Close() error {
	return nil
}

// Flush is a no-op.
func (n *nopCloseFlusher) Flush() error {
	return nil
}
// Package useragent provides helper functions to pack
// version information into a single User-Agent header.
package useragent
import (
"strings"
)
// VersionInfo is used to model UserAgent versions.
type VersionInfo struct {
	Name    string
	Version string
}

// isValid reports whether both Name and Version are non-empty and free of
// whitespace and slash characters, which would corrupt the header format.
func (vi *VersionInfo) isValid() bool {
	const stopChars = " \t\r\n/"
	if vi.Name == "" || strings.ContainsAny(vi.Name, stopChars) {
		return false
	}
	return vi.Version != "" && !strings.ContainsAny(vi.Version, stopChars)
}

// AppendVersions converts versions to a string and appends the string to the string base.
//
// Each VersionInfo will be converted to a string in the format of
// "product/version", where the "product" is get from the name field, while
// version is get from the version field. Several pieces of version information
// will be concatenated and separated by space. Invalid entries are skipped.
//
// Example:
// AppendVersions("base", VersionInfo{"foo", "1.0"}, VersionInfo{"bar", "2.0"})
// results in "base foo/1.0 bar/2.0".
func AppendVersions(base string, versions ...VersionInfo) string {
	if len(versions) == 0 {
		return base
	}

	parts := make([]string, 0, 1+len(versions))
	if base != "" {
		parts = append(parts, base)
	}
	for _, v := range versions {
		if v.isValid() {
			parts = append(parts, v.Name+"/"+v.Version)
		}
	}
	return strings.Join(parts, " ")
}
package seccomp
import (
"github.com/moby/profiles/seccomp"
"github.com/opencontainers/runtime-spec/specs-go"
)
// All functions in this file are thin forwarding shims to the
// github.com/moby/profiles/seccomp package, kept for backward compatibility.

// DefaultProfile defines the allowed syscalls for the default seccomp profile.
//
// Deprecated: use [seccomp.DefaultProfile].
func DefaultProfile() *seccomp.Seccomp {
	return seccomp.DefaultProfile()
}

// GetDefaultProfile returns the default seccomp profile.
//
// Deprecated: use [seccomp.GetDefaultProfile].
func GetDefaultProfile(rs *specs.Spec) (*specs.LinuxSeccomp, error) {
	return seccomp.GetDefaultProfile(rs)
}

// LoadProfile takes a json string and decodes the seccomp profile.
//
// Deprecated: use [seccomp.LoadProfile].
func LoadProfile(body string, rs *specs.Spec) (*specs.LinuxSeccomp, error) {
	return seccomp.LoadProfile(body, rs)
}
package reference
// notFoundError is a reference-store error carrying the NotFound
// classification marker.
type notFoundError string

// Error returns the error message.
func (e notFoundError) Error() string { return string(e) }

// NotFound marks the error as a "not found" error.
func (notFoundError) NotFound() {}

// invalidTagError is a reference-store error carrying the InvalidParameter
// classification marker.
type invalidTagError string

// Error returns the error message.
func (e invalidTagError) Error() string { return string(e) }

// InvalidParameter marks the error as an invalid-parameter error.
func (invalidTagError) InvalidParameter() {}

// conflictingTagError is a reference-store error carrying the Conflict
// classification marker.
type conflictingTagError string

// Error returns the error message.
func (e conflictingTagError) Error() string { return string(e) }

// Conflict marks the error as a conflict error.
func (conflictingTagError) Conflict() {}
package reference
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"sort"
"sync"
"github.com/distribution/reference"
"github.com/moby/sys/atomicwriter"
"github.com/opencontainers/go-digest"
"github.com/pkg/errors"
)
// ErrDoesNotExist is returned if a reference is not found in the
// store. It is typed notFoundError so it carries the NotFound marker.
var ErrDoesNotExist notFoundError = "reference does not exist"
// An Association is a tuple associating a reference with an image ID.
type Association struct {
	// Ref is the named image reference.
	Ref reference.Named
	// ID is the digest of the image the reference points to.
	ID digest.Digest
}
// Store provides the set of methods which can operate on a reference store.
type Store interface {
	// References returns all references pointing at the given image ID.
	References(id digest.Digest) []reference.Named
	// ReferencesByName returns the associations matching the given
	// reference name.
	ReferencesByName(ref reference.Named) []Association
	// AddTag adds a tag reference; force permits overwriting an existing tag.
	AddTag(ref reference.Named, id digest.Digest, force bool) error
	// AddDigest adds a digest reference.
	AddDigest(ref reference.Canonical, id digest.Digest, force bool) error
	// Delete removes a reference, reporting whether a deletion happened.
	Delete(ref reference.Named) (bool, error)
	// Get resolves a reference to an image ID.
	Get(ref reference.Named) (digest.Digest, error)
}
// refStore is the file-backed implementation of Store.
type refStore struct {
	// mu guards Repositories and referencesByIDCache.
	mu sync.RWMutex
	// jsonPath is the path to the file where the serialized tag data is
	// stored.
	jsonPath string
	// Repositories is a map of repositories, indexed by name.
	Repositories map[string]repository
	// referencesByIDCache is a cache of references indexed by ID, to speed
	// up References.
	referencesByIDCache map[digest.Digest]map[string]reference.Named
}

// repository maps tags to digests. The key is a stringified Reference,
// including the repository name.
type repository map[string]digest.Digest
// lexicalRefs implements sort.Interface to order references lexically by
// their string form.
type lexicalRefs []reference.Named

func (a lexicalRefs) Len() int      { return len(a) }
func (a lexicalRefs) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a lexicalRefs) Less(i, j int) bool {
	return a[i].String() < a[j].String()
}

// lexicalAssociations implements sort.Interface to order associations
// lexically by their reference's string form.
type lexicalAssociations []Association

func (a lexicalAssociations) Len() int      { return len(a) }
func (a lexicalAssociations) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a lexicalAssociations) Less(i, j int) bool {
	return a[i].Ref.String() < a[j].Ref.String()
}
// NewReferenceStore creates a new reference store, tied to a file path where
// the set of references are serialized in JSON format.
func NewReferenceStore(jsonPath string) (Store, error) {
	abspath, err := filepath.Abs(jsonPath)
	if err != nil {
		return nil, err
	}

	store := &refStore{
		jsonPath:            abspath,
		Repositories:        make(map[string]repository),
		referencesByIDCache: make(map[digest.Digest]map[string]reference.Named),
	}
	// Load the json file if it exists, otherwise create it.
	switch err := store.reload(); {
	case os.IsNotExist(err):
		if err := store.save(); err != nil {
			return nil, err
		}
	case err != nil:
		return nil, err
	}
	return store, nil
}
// AddTag adds a tag reference to the store. If force is set to true, existing
// references can be overwritten. This only works for tags, not digests.
func (store *refStore) AddTag(ref reference.Named, id digest.Digest, force bool) error {
	// Reject digest references outright; digests go through AddDigest.
	if _, isCanonical := ref.(reference.Canonical); isCanonical {
		return errors.WithStack(invalidTagError("refusing to create a tag with a digest reference"))
	}
	// TagNameOnly ensures the reference carries an explicit tag before
	// being stored.
	return store.addReference(reference.TagNameOnly(ref), id, force)
}

// AddDigest adds a digest reference to the store.
func (store *refStore) AddDigest(ref reference.Canonical, id digest.Digest, force bool) error {
	return store.addReference(ref, id, force)
}
// favorDigest returns a digest-only reference when originalRef carries both
// a tag and a digest: only the digest is stored, so the tag is trimmed off.
// References without that combination are returned unchanged.
func favorDigest(originalRef reference.Named) (reference.Named, error) {
	canonical, isCanonical := originalRef.(reference.Canonical)
	_, isNamedTagged := originalRef.(reference.NamedTagged)
	if !isCanonical || !isNamedTagged {
		return originalRef, nil
	}
	trimmed, err := reference.WithDigest(reference.TrimNamed(canonical), canonical.Digest())
	if err != nil {
		// should never happen
		return originalRef, err
	}
	return trimmed, nil
}
// addReference records ref -> id under the store lock, maintains the
// by-ID cache, and persists the store to disk. force only permits
// overwriting existing tags, never digests.
func (store *refStore) addReference(ref reference.Named, id digest.Digest, force bool) error {
	ref, err := favorDigest(ref)
	if err != nil {
		return err
	}

	refName := reference.FamiliarName(ref)
	refStr := reference.FamiliarString(ref)

	// A repository named after the digest algorithm would be ambiguous
	// with digest references.
	if refName == string(digest.Canonical) {
		return errors.WithStack(invalidTagError("refusing to create an ambiguous tag using digest algorithm as name"))
	}

	store.mu.Lock()
	defer store.mu.Unlock()

	repo, exists := store.Repositories[refName]
	if !exists || repo == nil {
		repo = make(map[string]digest.Digest)
		store.Repositories[refName] = repo
	}

	oldID, exists := repo[refStr]

	if exists {
		if oldID == id {
			// Nothing to do. The caller may have checked for this using store.Get in advance, but store.mu was unlocked in the meantime, so this can legitimately happen nevertheless.
			return nil
		}

		// force only works for tags
		if digested, isDigest := ref.(reference.Canonical); isDigest {
			return errors.WithStack(conflictingTagError("cannot overwrite digest " + digested.Digest().String()))
		}

		if !force {
			return errors.WithStack(
				conflictingTagError(
					fmt.Sprintf("tag %s is already set to image %s, use the force option to replace it", refStr, oldID.String()),
				),
			)
		}

		// Drop the cache entry for the reference being overwritten, and
		// the whole by-ID bucket when it becomes empty.
		if store.referencesByIDCache[oldID] != nil {
			delete(store.referencesByIDCache[oldID], refStr)
			if len(store.referencesByIDCache[oldID]) == 0 {
				delete(store.referencesByIDCache, oldID)
			}
		}
	}

	repo[refStr] = id
	if store.referencesByIDCache[id] == nil {
		store.referencesByIDCache[id] = make(map[string]reference.Named)
	}
	store.referencesByIDCache[id][refStr] = ref

	return store.save()
}
// Delete deletes a reference from the store. It returns true if a deletion
// happened, or false otherwise.
func (store *refStore) Delete(ref reference.Named) (bool, error) {
	ref, err := favorDigest(ref)
	if err != nil {
		return false, err
	}

	// Normalize to a tagged form so lookups match what addReference stored.
	ref = reference.TagNameOnly(ref)

	refName := reference.FamiliarName(ref)
	refStr := reference.FamiliarString(ref)

	store.mu.Lock()
	defer store.mu.Unlock()

	repo, exists := store.Repositories[refName]
	if !exists {
		return false, ErrDoesNotExist
	}

	if id, exists := repo[refStr]; exists {
		delete(repo, refStr)
		// Remove the repository entirely when its last reference is gone.
		if len(repo) == 0 {
			delete(store.Repositories, refName)
		}
		// Keep the by-ID cache in sync with the repository map.
		if store.referencesByIDCache[id] != nil {
			delete(store.referencesByIDCache[id], refStr)
			if len(store.referencesByIDCache[id]) == 0 {
				delete(store.referencesByIDCache, id)
			}
		}
		return true, store.save()
	}

	return false, ErrDoesNotExist
}
// Get retrieves an item from the store by reference
func (store *refStore) Get(ref reference.Named) (digest.Digest, error) {
	if canonical, ok := ref.(reference.Canonical); ok {
		// If reference contains both tag and digest, only
		// lookup by digest as it takes precedence over
		// tag, until tag/digest combos are stored.
		if _, ok := ref.(reference.Tagged); ok {
			var err error
			ref, err = reference.WithDigest(reference.TrimNamed(canonical), canonical.Digest())
			if err != nil {
				return "", err
			}
		}
	} else {
		// Default to the "latest" tag when no tag was given.
		ref = reference.TagNameOnly(ref)
	}

	refName := reference.FamiliarName(ref)
	refStr := reference.FamiliarString(ref)

	store.mu.RLock()
	defer store.mu.RUnlock()

	repo, exists := store.Repositories[refName]
	if !exists || repo == nil {
		return "", ErrDoesNotExist
	}

	id, exists := repo[refStr]
	if !exists {
		return "", ErrDoesNotExist
	}

	return id, nil
}
// References returns a slice of references to the given ID. The slice
// will be nil if there are no references to this ID.
func (store *refStore) References(id digest.Digest) []reference.Named {
	store.mu.RLock()
	defer store.mu.RUnlock()

	// Copy the internal map into a slice: callers must not be handed a
	// mutable view of the cache, and the map keys are an implementation
	// detail that should not be exposed.
	var refs []reference.Named
	for _, named := range store.referencesByIDCache[id] {
		refs = append(refs, named)
	}
	sort.Sort(lexicalRefs(refs))
	return refs
}
// ReferencesByName returns the references for a given repository name.
// If there are no references known for this repository name,
// ReferencesByName returns nil.
func (store *refStore) ReferencesByName(ref reference.Named) []Association {
	name := reference.FamiliarName(ref)

	store.mu.RLock()
	defer store.mu.RUnlock()

	repo, ok := store.Repositories[name]
	if !ok {
		return nil
	}

	var out []Association
	for refStr, imgID := range repo {
		parsed, err := reference.ParseNormalizedNamed(refStr)
		if err != nil {
			// Should never happen
			return nil
		}
		out = append(out, Association{
			Ref: parsed,
			ID:  imgID,
		})
	}
	sort.Sort(lexicalAssociations(out))
	return out
}
// save serializes the store to JSON and atomically writes it to the
// store's backing file (store.jsonPath) with owner-only permissions.
func (store *refStore) save() error {
	// Store the json
	jsonData, err := json.Marshal(store)
	if err != nil {
		return err
	}
	return atomicwriter.WriteFile(store.jsonPath, jsonData, 0o600)
}
// reload reads the store's state back from its JSON file on disk and
// rebuilds the in-memory referencesByIDCache reverse index.
func (store *refStore) reload() error {
	f, err := os.Open(store.jsonPath)
	if err != nil {
		return err
	}
	defer f.Close()
	if err := json.NewDecoder(f).Decode(&store); err != nil {
		return err
	}

	// Rebuild the ID -> references reverse-lookup cache from the
	// repositories that were just decoded.
	for _, repo := range store.Repositories {
		for refStr, refID := range repo {
			ref, err := reference.ParseNormalizedNamed(refStr)
			if err != nil {
				// Should never happen
				continue
			}
			if store.referencesByIDCache[refID] == nil {
				store.referencesByIDCache[refID] = make(map[string]reference.Named)
			}
			store.referencesByIDCache[refID][refStr] = ref
		}
	}

	return nil
}
package registry
import (
"context"
"net/http"
"net/url"
"strings"
"time"
"github.com/containerd/log"
"github.com/docker/distribution/registry/client/auth"
"github.com/docker/distribution/registry/client/auth/challenge"
"github.com/docker/distribution/registry/client/transport"
"github.com/moby/moby/api/types/registry"
"github.com/pkg/errors"
)
// AuthClientID is the ClientID used for the token server.
const AuthClientID = "docker"
// loginCredentialStore exposes an AuthConfig through the distribution
// auth.CredentialStore interface. SetRefreshToken writes the received token
// back into the wrapped AuthConfig so the caller can retrieve it afterwards.
type loginCredentialStore struct {
	authConfig *registry.AuthConfig
}

// Basic returns the username/password pair for basic authentication.
func (lcs loginCredentialStore) Basic(*url.URL) (string, string) {
	return lcs.authConfig.Username, lcs.authConfig.Password
}

// RefreshToken returns the identity token used for token refresh.
func (lcs loginCredentialStore) RefreshToken(*url.URL, string) string {
	return lcs.authConfig.IdentityToken
}

// SetRefreshToken stores the refresh token received from the token server
// into the wrapped AuthConfig.
func (lcs loginCredentialStore) SetRefreshToken(u *url.URL, service, token string) {
	lcs.authConfig.IdentityToken = token
}
// staticCredentialStore is an auth.CredentialStore that always returns the
// same fixed credentials and discards refresh tokens.
type staticCredentialStore struct {
	auth *registry.AuthConfig
}

// NewStaticCredentialStore returns a credential store
// which always returns the same credential values.
func NewStaticCredentialStore(auth *registry.AuthConfig) auth.CredentialStore {
	return staticCredentialStore{
		auth: auth,
	}
}

// Basic returns the configured username/password, or empty strings when no
// auth configuration was provided.
func (scs staticCredentialStore) Basic(*url.URL) (string, string) {
	if scs.auth == nil {
		return "", ""
	}
	return scs.auth.Username, scs.auth.Password
}

// RefreshToken returns the configured identity token, if any.
func (scs staticCredentialStore) RefreshToken(*url.URL, string) string {
	if scs.auth == nil {
		return ""
	}
	return scs.auth.IdentityToken
}

// SetRefreshToken is a no-op: the store is static.
func (scs staticCredentialStore) SetRefreshToken(*url.URL, string, string) {
}
// loginV2 tries to login to the v2 registry server. The given registry
// endpoint will be pinged to get authorization challenges. These challenges
// will be used to authenticate against the registry to validate credentials.
func loginV2(ctx context.Context, authConfig *registry.AuthConfig, endpoint APIEndpoint, userAgent string) (token string, _ error) {
	endpointStr := strings.TrimRight(endpoint.URL.String(), "/") + "/v2/"
	log.G(ctx).WithField("endpoint", endpointStr).Debug("attempting v2 login to registry endpoint")

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpointStr, http.NoBody)
	if err != nil {
		return "", err
	}

	var (
		modifiers = Headers(userAgent, nil)
		authTrans = transport.NewTransport(newTransport(endpoint.TLSConfig), modifiers...)
		// Operate on a copy so the caller's AuthConfig is not mutated when
		// the credential store receives a refresh token.
		credentialAuthConfig = *authConfig
		creds                = loginCredentialStore{authConfig: &credentialAuthConfig}
	)

	loginClient, err := v2AuthHTTPClient(endpoint.URL, authTrans, modifiers, creds, nil)
	if err != nil {
		return "", err
	}

	resp, err := loginClient.Do(req)
	if err != nil {
		err = translateV2AuthError(err)
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// TODO(dmcgowan): Attempt to further interpret result, status code and error code string
		return "", errors.Errorf("login attempt to %s failed with status: %d %s", endpointStr, resp.StatusCode, http.StatusText(resp.StatusCode))
	}

	// A successful request may have stored an identity token in the copied
	// AuthConfig via loginCredentialStore.SetRefreshToken; return it.
	return credentialAuthConfig.IdentityToken, nil
}
// v2AuthHTTPClient pings the endpoint to collect authentication challenges
// and returns an HTTP client whose transport answers those challenges using
// token authentication (with the given credentials and scopes) or basic auth.
func v2AuthHTTPClient(endpoint *url.URL, authTransport http.RoundTripper, modifiers []transport.RequestModifier, creds auth.CredentialStore, scopes []auth.Scope) (*http.Client, error) {
	challengeManager, err := PingV2Registry(endpoint, authTransport)
	if err != nil {
		return nil, err
	}

	// Token auth is tried first, falling back to basic auth.
	authHandlers := []auth.AuthenticationHandler{
		auth.NewTokenHandlerWithOptions(auth.TokenHandlerOptions{
			Transport:     authTransport,
			Credentials:   creds,
			OfflineAccess: true,
			ClientID:      AuthClientID,
			Scopes:        scopes,
		}),
		auth.NewBasicHandler(creds),
	}
	modifiers = append(modifiers, auth.NewAuthorizer(challengeManager, authHandlers...))

	return &http.Client{
		Transport: transport.NewTransport(authTransport, modifiers...),
		Timeout:   15 * time.Second,
	}, nil
}
// ConvertToHostname normalizes a registry URL which has http|https prepended
// to just its hostname. It is used to match credentials, which may be either
// stored as hostname or as hostname including scheme (in legacy configuration
// files).
func ConvertToHostname(url string) string {
	hostname := url
	switch {
	case strings.HasPrefix(hostname, "http://"):
		hostname = strings.TrimPrefix(hostname, "http://")
	case strings.HasPrefix(hostname, "https://"):
		hostname = strings.TrimPrefix(hostname, "https://")
	}
	// Drop any path component after the host.
	hostname, _, _ = strings.Cut(hostname, "/")
	return hostname
}
// ResolveAuthConfig matches an auth configuration to a server address or a URL
func ResolveAuthConfig(authConfigs map[string]registry.AuthConfig, index *registry.IndexInfo) registry.AuthConfig {
	configKey := GetAuthConfigKey(index)

	// Direct lookup first; for the official index the entry under its
	// canonical key (possibly the zero value) is always the answer.
	if ac, ok := authConfigs[configKey]; ok || index.Official {
		return ac
	}

	// Fall back to legacy config entries that were keyed by full URL:
	// normalize each stored key to a hostname and compare.
	for legacyKey, ac := range authConfigs {
		if ConvertToHostname(legacyKey) == configKey {
			return ac
		}
	}

	// No credentials found for this index.
	return registry.AuthConfig{}
}
// PingResponseError is used when the response from a ping
// was received but invalid.
type PingResponseError struct {
Err error
}
func (err PingResponseError) Error() string {
return err.Err.Error()
}
// PingV2Registry attempts to ping a v2 registry and on success return a
// challenge manager for the supported authentication types.
// If a response is received but cannot be interpreted, a PingResponseError will be returned.
func PingV2Registry(endpoint *url.URL, transport http.RoundTripper) (challenge.Manager, error) {
	pingClient := &http.Client{
		Transport: transport,
		Timeout:   15 * time.Second,
	}
	endpointStr := strings.TrimRight(endpoint.String(), "/") + "/v2/"
	req, err := http.NewRequest(http.MethodGet, endpointStr, http.NoBody)
	if err != nil {
		return nil, err
	}
	resp, err := pingClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// Record the authentication challenges (if any) from the ping response
	// so callers know which auth schemes the registry supports.
	challengeManager := challenge.NewSimpleManager()
	if err := challengeManager.AddResponse(resp); err != nil {
		return nil, PingResponseError{
			Err: err,
		}
	}
	return challengeManager, nil
}
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
//go:build go1.23
package registry
import (
"context"
"net"
"net/url"
"os"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
"sync"
"github.com/containerd/log"
"github.com/distribution/reference"
"github.com/moby/moby/api/types/registry"
)
// ServiceOptions holds command line options.
type ServiceOptions struct {
	// Mirrors is the list of registry mirrors for the official registry.
	Mirrors []string `json:"registry-mirrors,omitempty"`
	// InsecureRegistries lists registries (host[:port] or CIDR notation)
	// that may be reached over plain HTTP or HTTPS with untrusted certs.
	InsecureRegistries []string `json:"insecure-registries,omitempty"`
}
// serviceConfig holds daemon configuration for the registry service.
// It is a local alias of the API type so that methods can be attached to it.
type serviceConfig registry.ServiceConfig
// TODO(thaJeztah) both the "index.docker.io" and "registry-1.docker.io" domains
// are here for historic reasons and backward-compatibility. These domains
// are still supported by Docker Hub (and will continue to be supported), but
// there are new domains already in use, and plans to consolidate all legacy
// domains to new "canonical" domains. Once those domains are decided on, we
// should update these consts (but making sure to preserve compatibility with
// existing installs, clients, and user configuration).
const (
	// DefaultNamespace is the default namespace.
	DefaultNamespace = "docker.io"
	// DefaultRegistryHost is the hostname for the default (Docker Hub) registry
	// used for pushing and pulling images. This hostname is hard-coded to handle
	// the conversion from image references without registry name (e.g. "ubuntu",
	// or "ubuntu:latest"), as well as references using the "docker.io" domain
	// name, which is used as canonical reference for images on Docker Hub, but
	// does not match the domain-name of Docker Hub's registry.
	DefaultRegistryHost = "registry-1.docker.io"
	// IndexHostname is the index hostname, used for authentication and image search.
	IndexHostname = "index.docker.io"
	// IndexServer is used for user auth and image search.
	IndexServer = "https://" + IndexHostname + "/v1/"
	// IndexName is the name of the index.
	IndexName = "docker.io"
)
var (
	// DefaultV2Registry is the URI of the default (Docker Hub) registry.
	DefaultV2Registry = &url.URL{
		Scheme: "https",
		Host:   DefaultRegistryHost,
	}

	// validHostPortRegex matches a bare registry host ("host" or
	// "host:port"); the regexp is compiled lazily on first use.
	validHostPortRegex = sync.OnceValue(func() *regexp.Regexp {
		return regexp.MustCompile(`^` + reference.DomainRegexp.String() + `$`)
	})
)
// runningWithRootlessKit is a fork of [rootless.RunningWithRootlessKit],
// but inlining it to prevent adding that as a dependency for docker/cli.
//
// [rootless.RunningWithRootlessKit]: https://github.com/moby/moby/blob/b4bdf12daec84caaf809a639f923f7370d4926ad/pkg/rootless/rootless.go#L5-L8
func runningWithRootlessKit() bool {
return runtime.GOOS == "linux" && os.Getenv("ROOTLESSKIT_STATE_DIR") != ""
}
// CertsDir is the directory where certificates are stored.
//
// - Linux: "/etc/docker/certs.d/"
// - Linux (with rootlessKit): $XDG_CONFIG_HOME/docker/certs.d/" or "$HOME/.config/docker/certs.d/"
// - Windows: "%PROGRAMDATA%/docker/certs.d/"
//
// TODO(thaJeztah): certsDir but stored in our config, and passed when needed. For the CLI, we should also default to same path as rootless.
func CertsDir() string {
	switch {
	case runningWithRootlessKit():
		// Rootless: prefer the user's config directory; fall back to the
		// system-wide default when it cannot be determined.
		if configHome, _ := os.UserConfigDir(); configHome != "" {
			return filepath.Join(configHome, "docker", "certs.d")
		}
	case runtime.GOOS == "windows":
		return filepath.Join(os.Getenv("programdata"), "docker", "certs.d")
	}
	return "/etc/docker/certs.d"
}
// newServiceConfig returns a new instance of ServiceConfig
func newServiceConfig(options ServiceOptions) (*serviceConfig, error) {
	cfg := &serviceConfig{}
	// Mirrors must be loaded first: loadInsecureRegistries reads
	// cfg.Mirrors when configuring the public index.
	if err := cfg.loadMirrors(options.Mirrors); err != nil {
		return nil, err
	}
	if err := cfg.loadInsecureRegistries(options.InsecureRegistries); err != nil {
		return nil, err
	}
	return cfg, nil
}
// copy constructs a new ServiceConfig with a copy of the configuration in config.
func (config *serviceConfig) copy() *registry.ServiceConfig {
ic := make(map[string]*registry.IndexInfo)
for key, value := range config.IndexConfigs {
ic[key] = value
}
return ®istry.ServiceConfig{
InsecureRegistryCIDRs: append([]*registry.NetIPNet(nil), config.InsecureRegistryCIDRs...),
IndexConfigs: ic,
Mirrors: append([]string(nil), config.Mirrors...),
}
}
// loadMirrors loads mirrors to config, after removing duplicates.
// Returns an error if mirrors contains an invalid mirror.
func (config *serviceConfig) loadMirrors(mirrors []string) error {
	seen := map[string]struct{}{}
	unique := []string{}
	for _, mirror := range mirrors {
		normalized, err := ValidateMirror(mirror)
		if err != nil {
			return err
		}
		// Keep only the first occurrence of each normalized mirror.
		if _, dup := seen[normalized]; dup {
			continue
		}
		seen[normalized] = struct{}{}
		unique = append(unique, normalized)
	}

	config.Mirrors = unique

	// Configure public registry since mirrors may have changed.
	config.IndexConfigs = map[string]*registry.IndexInfo{
		IndexName: {
			Name:     IndexName,
			Mirrors:  unique,
			Secure:   true,
			Official: true,
		},
	}
	return nil
}
// loadInsecureRegistries loads insecure registries to config
func (config *serviceConfig) loadInsecureRegistries(registries []string) error {
// Localhost is by default considered as an insecure registry. This is a
// stop-gap for people who are running a private registry on localhost.
registries = append(registries, "::1/128", "127.0.0.0/8")
var (
insecureRegistryCIDRs = make([]*registry.NetIPNet, 0)
indexConfigs = make(map[string]*registry.IndexInfo)
)
skip:
for _, r := range registries {
// validate insecure registry
if _, err := ValidateIndexName(r); err != nil {
return err
}
if strings.HasPrefix(strings.ToLower(r), "http://") {
log.G(context.TODO()).Warnf("insecure registry %s should not contain 'http://' and 'http://' has been removed from the insecure registry config", r)
r = r[7:]
} else if strings.HasPrefix(strings.ToLower(r), "https://") {
log.G(context.TODO()).Warnf("insecure registry %s should not contain 'https://' and 'https://' has been removed from the insecure registry config", r)
r = r[8:]
} else if hasScheme(r) {
return invalidParamf("insecure registry %s should not contain '://'", r)
}
// Check if CIDR was passed to --insecure-registry
_, ipnet, err := net.ParseCIDR(r)
if err == nil {
// Valid CIDR. If ipnet is already in config.InsecureRegistryCIDRs, skip.
data := (*registry.NetIPNet)(ipnet)
for _, value := range insecureRegistryCIDRs {
if value.IP.String() == data.IP.String() && value.Mask.String() == data.Mask.String() {
continue skip
}
}
// ipnet is not found, add it in config.InsecureRegistryCIDRs
insecureRegistryCIDRs = append(insecureRegistryCIDRs, data)
} else {
if err := validateHostPort(r); err != nil {
return invalidParamWrapf(err, "insecure registry %s is not valid", r)
}
// Assume `host:port` if not CIDR.
indexConfigs[r] = ®istry.IndexInfo{
Name: r,
Mirrors: []string{},
Secure: false,
Official: false,
}
}
}
// Configure public registry.
indexConfigs[IndexName] = ®istry.IndexInfo{
Name: IndexName,
Mirrors: config.Mirrors,
Secure: true,
Official: true,
}
config.InsecureRegistryCIDRs = insecureRegistryCIDRs
config.IndexConfigs = indexConfigs
return nil
}
// isSecureIndex returns false if the provided indexName is part of the list of insecure registries
// Insecure registries accept HTTP and/or accept HTTPS with certificates from unknown CAs.
//
// The list of insecure registries can contain an element with CIDR notation to specify a whole subnet.
// If the subnet contains one of the IPs of the registry specified by indexName, the latter is considered
// insecure.
//
// indexName should be a URL.Host (`host:port` or `host`) where the `host` part can be either a domain name
// or an IP address. If it is a domain name, then it will be resolved in order to check if the IP is contained
// in a subnet. If the resolving is not successful, isSecureIndex will only try to match hostname to any element
// of insecureRegistries.
func (config *serviceConfig) isSecureIndex(indexName string) bool {
	// Honor an explicit per-index configuration when one exists; this matters
	// when isSecureIndex is called from anything besides newIndexInfo.
	if index, ok := config.IndexConfigs[indexName]; ok {
		return index.Secure
	}
	// Otherwise the index is secure unless it falls inside a configured
	// insecure subnet.
	return !isCIDRMatch(config.InsecureRegistryCIDRs, indexName)
}
// lookupIP resolves a hostname to IP addresses; it is a package-level
// variable (aliasing net.LookupIP) for mocking in unit tests.
var lookupIP = net.LookupIP
// isCIDRMatch returns true if URLHost matches an element of cidrs. URLHost is a URL.Host (`host:port` or `host`)
// where the `host` part can be either a domain name or an IP address. If it is a domain name, then it will be
// resolved to IP addresses for matching. If resolution fails, false is returned.
func isCIDRMatch(cidrs []*registry.NetIPNet, URLHost string) bool {
	if len(cidrs) == 0 {
		return false
	}

	host, _, err := net.SplitHostPort(URLHost)
	if err != nil {
		// Assume URLHost is a host without port and go on.
		host = URLHost
	}

	var ips []net.IP
	if parsed := net.ParseIP(host); parsed != nil {
		// Host is an IP-address.
		ips = []net.IP{parsed}
	} else {
		// Try to resolve the host's IP-address.
		resolved, lookupErr := lookupIP(host)
		if lookupErr != nil {
			// We failed to resolve the host; assume there's no match.
			return false
		}
		ips = resolved
	}

	// Report a match when any resolved address falls inside any subnet.
	for _, ip := range ips {
		for _, subnet := range cidrs {
			if (*net.IPNet)(subnet).Contains(ip) {
				return true
			}
		}
	}
	return false
}
// ValidateMirror validates and normalizes an HTTP(S) registry mirror. It
// returns an error if the given mirrorURL is invalid, or the normalized
// format for the URL otherwise.
//
// It is used by the daemon to validate the daemon configuration.
func ValidateMirror(mirrorURL string) (string, error) {
	// Fast path for missing scheme, as url.Parse splits by ":", which can
	// cause the hostname to be considered the "scheme" when using "hostname:port".
	scheme, _, hasSep := strings.Cut(mirrorURL, "://")
	if !hasSep || scheme == "" {
		return "", invalidParamf("invalid mirror: no scheme specified for %q: must use either 'https://' or 'http://'", mirrorURL)
	}
	uri, err := url.Parse(mirrorURL)
	if err != nil {
		return "", invalidParamWrapf(err, "invalid mirror: %q is not a valid URI", mirrorURL)
	}
	switch uri.Scheme {
	case "http", "https":
		// supported schemes
	default:
		return "", invalidParamf("invalid mirror: unsupported scheme %q in %q: must use either 'https://' or 'http://'", uri.Scheme, uri)
	}
	if uri.RawQuery != "" || uri.Fragment != "" {
		return "", invalidParamf("invalid mirror: query or fragment at end of the URI %q", uri)
	}
	if uri.User != nil {
		// strip password from output
		uri.User = url.UserPassword(uri.User.Username(), "xxxxx")
		return "", invalidParamf("invalid mirror: username/password not allowed in URI %q", uri)
	}
	// Normalize to exactly one trailing slash.
	return strings.TrimSuffix(mirrorURL, "/") + "/", nil
}
// ValidateIndexName validates an index name. It is used by the daemon to
// validate the daemon configuration.
func ValidateIndexName(val string) (string, error) {
	normalized := normalizeIndexName(val)
	if strings.HasPrefix(normalized, "-") || strings.HasSuffix(normalized, "-") {
		return "", invalidParamf("invalid index name (%s). Cannot begin or end with a hyphen", normalized)
	}
	return normalized, nil
}
// normalizeIndexName maps the legacy "index.docker.io" hostname to the
// canonical "docker.io" index name; all other values pass through unchanged.
func normalizeIndexName(val string) string {
	// TODO(thaJeztah): consider normalizing other known options, such as "(https://)registry-1.docker.io", "https://index.docker.io/v1/".
	// TODO: upstream this to check to reference package
	switch val {
	case "index.docker.io":
		return "docker.io"
	default:
		return val
	}
}
// hasScheme reports whether the repository name contains a URL scheme
// separator ("://").
func hasScheme(reposName string) bool {
	_, _, found := strings.Cut(reposName, "://")
	return found
}
// validateHostPort checks that s is a valid "host" or "host:port" (including
// "IPv6:port") registry address with a port in the range 0-65535.
func validateHostPort(s string) error {
	// Split host and port, and in case s can not be split, assume host only
	host, port, err := net.SplitHostPort(s)
	if err != nil {
		host, port = s, ""
	}
	// If match against the `host:port` pattern fails,
	// it might be `IPv6:port`, which will be captured by net.ParseIP(host)
	if !validHostPortRegex().MatchString(s) && net.ParseIP(host) == nil {
		return invalidParamf("invalid host %q", host)
	}
	if port == "" {
		return nil
	}
	portNum, err := strconv.Atoi(port)
	if err != nil {
		return err
	}
	if portNum < 0 || portNum > 65535 {
		return invalidParamf("invalid port %q", port)
	}
	return nil
}
// newIndexInfo returns IndexInfo configuration from indexName
func newIndexInfo(config *serviceConfig, indexName string) *registry.IndexInfo {
indexName = normalizeIndexName(indexName)
// Return any configured index info, first.
if index, ok := config.IndexConfigs[indexName]; ok {
return index
}
// Construct a non-configured index info.
return ®istry.IndexInfo{
Name: indexName,
Mirrors: []string{},
Secure: config.isSecureIndex(indexName),
}
}
// GetAuthConfigKey special-cases using the full index address of the official
// index as the AuthConfig key, and uses the (host)name[:port] for private indexes.
func GetAuthConfigKey(index *registry.IndexInfo) string {
	if !index.Official {
		return index.Name
	}
	return IndexServer
}
// ParseRepositoryInfo performs the breakdown of a repository name into a
// [RepositoryInfo], but lacks registry configuration.
//
// It is used by the Docker cli to interact with registry-related endpoints.
func ParseRepositoryInfo(reposName reference.Named) (*RepositoryInfo, error) {
indexName := normalizeIndexName(reference.Domain(reposName))
if indexName == IndexName {
return &RepositoryInfo{
Name: reference.TrimNamed(reposName),
Index: ®istry.IndexInfo{
Name: IndexName,
Mirrors: []string{},
Secure: true,
Official: true,
},
Official: !strings.ContainsRune(reference.FamiliarName(reposName), '/'),
}, nil
}
return &RepositoryInfo{
Name: reference.TrimNamed(reposName),
Index: ®istry.IndexInfo{
Name: indexName,
Mirrors: []string{},
Secure: !isInsecure(indexName),
},
}, nil
}
// isInsecure is used to detect whether a registry domain or IP-address is allowed
// to use an insecure (non-TLS, or self-signed cert) connection according to the
// defaults, which allows for insecure connections with registries running on a
// loopback address ("localhost", "::1/128", "127.0.0.0/8").
//
// It is used in situations where we don't have access to the daemon's configuration,
// for example, when used from the client / CLI.
func isInsecure(hostNameOrIP string) bool {
	// Attempt to strip port if present; this also strips brackets for
	// IPv6 addresses with a port (e.g. "[::1]:5000").
	//
	// This is best-effort; we'll continue using the address as-is if it fails.
	if host, _, err := net.SplitHostPort(hostNameOrIP); err == nil {
		hostNameOrIP = host
	}

	switch {
	case hostNameOrIP == "127.0.0.1", hostNameOrIP == "::1", strings.EqualFold(hostNameOrIP, "localhost"):
		// Fast path; no need to resolve these, assuming nobody overrides
		// "localhost" for anything else than a loopback address (sorry, not sorry).
		return true
	}

	var ips []net.IP
	if parsed := net.ParseIP(hostNameOrIP); parsed != nil {
		ips = []net.IP{parsed}
	} else {
		// Try to resolve the host's IP-addresses; lookup failures are
		// treated as "no loopback address found".
		resolved, _ := lookupIP(hostNameOrIP)
		ips = resolved
	}
	for _, ip := range ips {
		if ip.IsLoopback() {
			return true
		}
	}
	return false
}
package registry
import (
"net/url"
"github.com/docker/distribution/registry/api/errcode"
"github.com/pkg/errors"
)
// translateV2AuthError converts a 401 "unauthorized" error returned by the
// v2 registry client into an error carrying the Unauthorized marker; any
// other error is returned unmodified.
func translateV2AuthError(err error) error {
	urlErr, ok := err.(*url.Error)
	if !ok {
		return err
	}
	apiErr, ok := urlErr.Err.(errcode.Error)
	if !ok {
		return err
	}
	if apiErr.Code == errcode.ErrorCodeUnauthorized {
		return unauthorizedErr{err}
	}
	return err
}
// invalidParam wraps err to mark it as an invalid-parameter error.
func invalidParam(err error) error {
	return invalidParameterErr{err}
}

// invalidParamf creates an invalid-parameter error from a format string.
func invalidParamf(format string, args ...interface{}) error {
	return invalidParameterErr{errors.Errorf(format, args...)}
}

// invalidParamWrapf wraps err with a formatted message and marks it as an
// invalid-parameter error.
func invalidParamWrapf(err error, format string, args ...interface{}) error {
	return invalidParameterErr{errors.Wrapf(err, format, args...)}
}
// unauthorizedErr wraps an error to classify it as an authentication
// failure via the Unauthorized marker method.
type unauthorizedErr struct{ error }

// Unauthorized is a marker method used by callers to classify this error.
func (unauthorizedErr) Unauthorized() {}

// Cause is kept for compatibility with github.com/pkg/errors causers.
func (e unauthorizedErr) Cause() error {
	return e.error
}

// Unwrap supports errors.Is / errors.As chains.
func (e unauthorizedErr) Unwrap() error {
	return e.error
}

// invalidParameterErr wraps an error to classify it as caused by an
// invalid parameter via the InvalidParameter marker method.
type invalidParameterErr struct{ error }

// InvalidParameter is a marker method used by callers to classify this error.
func (invalidParameterErr) InvalidParameter() {}

func (e invalidParameterErr) Unwrap() error {
	return e.error
}

// systemErr wraps an error to classify it as an internal system error.
type systemErr struct{ error }

// System is a marker method used by callers to classify this error.
func (systemErr) System() {}

func (e systemErr) Unwrap() error {
	return e.error
}

// errUnknown wraps an error whose cause could not be classified.
type errUnknown struct{ error }

// Unknown is a marker method used by callers to classify this error.
func (errUnknown) Unknown() {}

func (e errUnknown) Unwrap() error {
	return e.error
}
// Package registry contains client primitives to interact with a remote Docker registry.
package registry
import (
"context"
"crypto/tls"
"net"
"net/http"
"os"
"path/filepath"
"runtime"
"strings"
"time"
"github.com/containerd/log"
"github.com/docker/distribution/registry/client/transport"
"github.com/docker/go-connections/tlsconfig"
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
)
// hostCertsDir returns the config directory for a specific host.
func hostCertsDir(hostnameAndPort string) string {
	if runtime.GOOS == "windows" {
		// Ensure that a directory name is valid; hostnameAndPort may contain
		// a colon (:) if a port is included, and Windows does not allow colons
		// in directory names.
		hostnameAndPort = filepath.FromSlash(strings.ReplaceAll(hostnameAndPort, ":", ""))
	}
	return filepath.Join(CertsDir(), hostnameAndPort)
}
// newTLSConfig constructs a client TLS configuration based on server defaults
func newTLSConfig(ctx context.Context, hostname string, isSecure bool) (*tls.Config, error) {
	// PreferredServerCipherSuites should have no effect
	tlsConfig := tlsconfig.ServerDefault()
	tlsConfig.InsecureSkipVerify = !isSecure

	if !isSecure {
		// Insecure endpoints skip certificate loading entirely.
		return tlsConfig, nil
	}

	hostDir := hostCertsDir(hostname)
	log.G(ctx).Debugf("hostDir: %s", hostDir)
	if err := loadTLSConfig(ctx, hostDir, tlsConfig); err != nil {
		return nil, err
	}
	return tlsConfig, nil
}
func hasFile(files []os.DirEntry, name string) bool {
for _, f := range files {
if f.Name() == name {
return true
}
}
return false
}
// ReadCertsDirectory reads the directory for TLS certificates
// including roots and certificate pairs and updates the
// provided TLS configuration.
//
// It is a context-less wrapper around loadTLSConfig, kept for
// backward-compatibility of the exported API.
func ReadCertsDirectory(tlsConfig *tls.Config, directory string) error {
	return loadTLSConfig(context.TODO(), directory, tlsConfig)
}
// loadTLSConfig reads the directory for TLS certificates including roots and
// certificate pairs, and updates the provided TLS configuration.
//
// Files are selected by extension: ".crt" files are appended to the root CA
// pool, and ".cert"/".key" pairs are loaded as client certificates. A
// missing directory is not an error.
func loadTLSConfig(ctx context.Context, directory string, tlsConfig *tls.Config) error {
	fs, err := os.ReadDir(directory)
	if err != nil {
		if os.IsNotExist(err) {
			// No certificates configured for this host; nothing to do.
			return nil
		}
		return invalidParam(err)
	}

	for _, f := range fs {
		// Bail out early when the caller's context is cancelled.
		if ctx.Err() != nil {
			return ctx.Err()
		}
		switch filepath.Ext(f.Name()) {
		case ".crt":
			// Lazily seed the root pool from the system pool so custom CAs
			// are added to (not replacing) the system roots.
			if tlsConfig.RootCAs == nil {
				systemPool, err := tlsconfig.SystemCertPool()
				if err != nil {
					return invalidParamWrapf(err, "unable to get system cert pool")
				}
				tlsConfig.RootCAs = systemPool
			}
			fileName := filepath.Join(directory, f.Name())
			log.G(ctx).Debugf("crt: %s", fileName)
			data, err := os.ReadFile(fileName)
			if err != nil {
				return err
			}
			tlsConfig.RootCAs.AppendCertsFromPEM(data)
		case ".cert":
			certName := f.Name()
			keyName := certName[:len(certName)-5] + ".key"
			log.G(ctx).Debugf("cert: %s", filepath.Join(directory, certName))
			if !hasFile(fs, keyName) {
				return invalidParamf("missing key %s for client certificate %s. CA certificates must use the extension .crt", keyName, certName)
			}
			cert, err := tls.LoadX509KeyPair(filepath.Join(directory, certName), filepath.Join(directory, keyName))
			if err != nil {
				return err
			}
			tlsConfig.Certificates = append(tlsConfig.Certificates, cert)
		case ".key":
			// Keys are loaded together with their ".cert" in the case above;
			// here we only verify that each key has a matching certificate.
			keyName := f.Name()
			certName := keyName[:len(keyName)-4] + ".cert"
			log.G(ctx).Debugf("key: %s", filepath.Join(directory, keyName))
			if !hasFile(fs, certName) {
				return invalidParamf("missing client certificate %s for key %s", certName, keyName)
			}
		}
	}

	return nil
}
// Headers returns request modifiers with a User-Agent and metaHeaders
func Headers(userAgent string, metaHeaders http.Header) []transport.RequestModifier {
	// Intentionally a non-nil empty slice so callers always get a usable value.
	modifiers := []transport.RequestModifier{}
	if userAgent != "" {
		uaHeader := http.Header{
			"User-Agent": []string{userAgent},
		}
		modifiers = append(modifiers, transport.NewHeaderRequestModifier(uaHeader))
	}
	if metaHeaders != nil {
		modifiers = append(modifiers, transport.NewHeaderRequestModifier(metaHeaders))
	}
	return modifiers
}
// newTransport returns a new HTTP transport. If tlsConfig is nil, it uses the
// default TLS configuration. The transport is wrapped with otelhttp for
// request tracing.
func newTransport(tlsConfig *tls.Config) http.RoundTripper {
	if tlsConfig == nil {
		tlsConfig = tlsconfig.ServerDefault()
	}
	return otelhttp.NewTransport(
		&http.Transport{
			Proxy: http.ProxyFromEnvironment,
			DialContext: (&net.Dialer{
				Timeout:   30 * time.Second,
				KeepAlive: 30 * time.Second,
			}).DialContext,
			TLSHandshakeTimeout: 10 * time.Second,
			TLSClientConfig:     tlsConfig,
			// TODO(dmcgowan): Call close idle connections when complete and use keep alive
			DisableKeepAlives: true,
		},
	)
}
package registry
import (
"context"
"net/http"
"strconv"
"strings"
"github.com/containerd/log"
"github.com/docker/distribution/registry/client/auth"
"github.com/moby/moby/api/types/filters"
"github.com/moby/moby/api/types/registry"
"github.com/pkg/errors"
)
// acceptedSearchFilterTags lists the filter names accepted by Service.Search.
var acceptedSearchFilterTags = map[string]bool{
	"is-automated": true, // Deprecated: the "is_automated" field is deprecated and will always be false in the future.
	"is-official":  true,
	"stars":        true,
}
// Search queries the public registry for repositories matching the specified
// search term and filters.
func (s *Service) Search(ctx context.Context, searchFilters filters.Args, term string, limit int, authConfig *registry.AuthConfig, headers map[string][]string) ([]registry.SearchResult, error) {
	if err := searchFilters.Validate(acceptedSearchFilterTags); err != nil {
		return nil, err
	}

	isAutomated, err := searchFilters.GetBoolOrDefault("is-automated", false)
	if err != nil {
		return nil, err
	}

	// "is-automated" is deprecated and filtering for `true` will yield no results.
	if isAutomated {
		return []registry.SearchResult{}, nil
	}

	isOfficial, err := searchFilters.GetBoolOrDefault("is-official", false)
	if err != nil {
		return nil, err
	}

	// hasStarFilter is the highest value among any "stars=N" filters given;
	// results with fewer stars are filtered out below.
	hasStarFilter := 0
	if searchFilters.Contains("stars") {
		hasStars := searchFilters.Get("stars")
		for _, hasStar := range hasStars {
			iHasStar, err := strconv.Atoi(hasStar)
			if err != nil {
				return nil, invalidParameterErr{errors.Wrapf(err, "invalid filter 'stars=%s'", hasStar)}
			}
			if iHasStar > hasStarFilter {
				hasStarFilter = iHasStar
			}
		}
	}

	unfilteredResult, err := s.searchUnfiltered(ctx, term, limit, authConfig, headers)
	if err != nil {
		return nil, err
	}

	// Apply the "is-official" and "stars" filters client-side.
	filteredResults := []registry.SearchResult{}
	for _, result := range unfilteredResult.Results {
		if searchFilters.Contains("is-official") {
			if isOfficial != result.IsOfficial {
				continue
			}
		}
		if searchFilters.Contains("stars") {
			if result.StarCount < hasStarFilter {
				continue
			}
		}
		// "is-automated" is deprecated and the value in Docker Hub search
		// results is untrustworthy. Force it to false so as to not mislead our
		// clients.
		result.IsAutomated = false //nolint:staticcheck // ignore SA1019 (field is deprecated)
		filteredResults = append(filteredResults, result)
	}

	return filteredResults, nil
}
// searchUnfiltered performs a search for term against the index derived from
// the search term and returns the raw (unfiltered) results.
//
// When authConfig carries both an identity token and a username, a
// token-authenticated client (v2AuthHTTPClient with a "catalog"/"search"
// scope) is used; otherwise the plain endpoint client is authorized via
// authorizeClient.
func (s *Service) searchUnfiltered(ctx context.Context, term string, limit int, authConfig *registry.AuthConfig, headers http.Header) (*registry.SearchResults, error) {
	if hasScheme(term) {
		return nil, invalidParamf("invalid repository name: repository name (%s) should not have a scheme", term)
	}

	indexName, remoteName := splitReposSearchTerm(term)

	// Search is a long-running operation, just lock s.config to avoid block others.
	s.mu.RLock()
	index := newIndexInfo(s.config, indexName)
	s.mu.RUnlock()
	if index.Official {
		// If pull "library/foo", it's stored locally under "foo"
		remoteName = strings.TrimPrefix(remoteName, "library/")
	}

	endpoint, err := newV1Endpoint(ctx, index, headers)
	if err != nil {
		return nil, err
	}

	var client *http.Client
	if authConfig != nil && authConfig.IdentityToken != "" && authConfig.Username != "" {
		creds := NewStaticCredentialStore(authConfig)
		// TODO(thaJeztah); is there a reason not to include other headers here? (originally added in 19d48f0b8ba59eea9f2cac4ad1c7977712a6b7ac)
		modifiers := Headers(headers.Get("User-Agent"), nil)
		v2Client, err := v2AuthHTTPClient(endpoint.URL, endpoint.client.Transport, modifiers, creds, []auth.Scope{
			auth.RegistryScope{Name: "catalog", Actions: []string{"search"}},
		})
		if err != nil {
			return nil, err
		}
		// Copy non transport http client features
		v2Client.Timeout = endpoint.client.Timeout
		v2Client.CheckRedirect = endpoint.client.CheckRedirect
		v2Client.Jar = endpoint.client.Jar

		log.G(ctx).Debugf("using v2 client for search to %s", endpoint.URL)
		client = v2Client
	} else {
		client = endpoint.client
		if err := authorizeClient(ctx, client, authConfig, endpoint); err != nil {
			return nil, err
		}
	}

	return newSession(client, endpoint).searchRepositories(ctx, remoteName, limit)
}
// splitReposSearchTerm breaks a search term into an index name and remote
// name. When the first path component does not look like a registry host
// (no ".", no ":", and not "localhost"), the term is treated as a Docker Hub
// repository and the default index (docker.io) is returned with the term
// unchanged.
func splitReposSearchTerm(reposName string) (string, string) {
	indexName, remoteName, ok := strings.Cut(reposName, "/")
	if !ok || (!strings.Contains(indexName, ".") &&
		!strings.Contains(indexName, ":") && indexName != "localhost") {
		// This is a Docker Hub repository (ex: samalba/hipache or ubuntu),
		// use the default Docker Hub registry (docker.io)
		return IndexName, reposName
	}
	return indexName, remoteName
}
// ParseSearchIndexInfo will use repository name to get back an indexInfo.
//
// TODO(thaJeztah) this function is only used by the CLI, and used to get
// information of the registry (to provide credentials if needed). We should
// move this function (or equivalent) to the CLI, as it's doing too much just
// for that.
func ParseSearchIndexInfo(reposName string) (*registry.IndexInfo, error) {
indexName, _ := splitReposSearchTerm(reposName)
indexName = normalizeIndexName(indexName)
if indexName == IndexName {
return ®istry.IndexInfo{
Name: IndexName,
Mirrors: []string{},
Secure: true,
Official: true,
}, nil
}
return ®istry.IndexInfo{
Name: indexName,
Mirrors: []string{},
Secure: !isInsecure(indexName),
}, nil
}
package registry
import (
"context"
"crypto/tls"
"encoding/json"
"errors"
"net/http"
"net/url"
"strings"
"github.com/containerd/log"
"github.com/docker/distribution/registry/client/transport"
"github.com/moby/moby/api/types/registry"
)
// v1PingResult contains the information returned when pinging a registry. It
// indicates whether the registry claims to be a standalone registry.
type v1PingResult struct {
	// Standalone is set to true if the registry indicates it is a
	// standalone registry in the X-Docker-Registry-Standalone
	// header (or in the JSON body of the "_ping" response).
	Standalone bool `json:"standalone"`
}
// v1Endpoint stores basic information about a V1 registry endpoint.
type v1Endpoint struct {
	client *http.Client
	URL    *url.URL
	// IsSecure is false when the endpoint's TLS config allows
	// InsecureSkipVerify (i.e. the registry is configured as insecure).
	IsSecure bool
}
// newV1Endpoint parses the given address to return a registry endpoint.
//
// The endpoint is verified with an HTTPS ping first; for endpoints configured
// as insecure, a plain-HTTP ping is attempted as fallback when HTTPS fails.
// TODO: remove. This is only used by search.
func newV1Endpoint(ctx context.Context, index *registry.IndexInfo, headers http.Header) (*v1Endpoint, error) {
	tlsConfig, err := newTLSConfig(ctx, index.Name, index.Secure)
	if err != nil {
		return nil, err
	}

	endpoint, err := newV1EndpointFromStr(GetAuthConfigKey(index), tlsConfig, headers)
	if err != nil {
		return nil, err
	}

	if endpoint.String() == IndexServer {
		// Skip the check, we know this one is valid
		// (and we never want to fall back to http in case of error)
		return endpoint, nil
	}

	// Try HTTPS ping to registry
	endpoint.URL.Scheme = "https"
	if _, err := endpoint.ping(ctx); err != nil {
		if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
			// Propagate cancellation as-is; it says nothing about the endpoint.
			return nil, err
		}
		if endpoint.IsSecure {
			// If registry is secure and HTTPS failed, show user the error and tell them about `--insecure-registry`
			// in case that's what they need. DO NOT accept unknown CA certificates, and DO NOT fall back to HTTP.
			return nil, invalidParamf("invalid registry endpoint %s: %v. If this private registry supports only HTTP or HTTPS with an unknown CA certificate, please add `--insecure-registry %s` to the daemon's arguments. In the case of HTTPS, if you have access to the registry's CA certificate, no need for the flag; simply place the CA certificate at /etc/docker/certs.d/%s/ca.crt", endpoint, err, endpoint.URL.Host, endpoint.URL.Host)
		}

		// registry is insecure and HTTPS failed, fallback to HTTP.
		log.G(ctx).WithError(err).Debugf("error from registry %q marked as insecure - insecurely falling back to HTTP", endpoint)
		endpoint.URL.Scheme = "http"
		if _, err2 := endpoint.ping(ctx); err2 != nil {
			return nil, invalidParamf("invalid registry endpoint %q. HTTPS attempt: %v. HTTP attempt: %v", endpoint, err, err2)
		}
	}

	return endpoint, nil
}
// trimV1Address trims the "v1" version suffix off the address and returns
// the trimmed address. It returns an error on "v2" endpoints.
func trimV1Address(address string) (string, error) {
	trimmed := strings.TrimSuffix(address, "/")
	switch {
	case strings.HasSuffix(trimmed, "/v2"):
		return "", invalidParamf("search is not supported on v2 endpoints: %s", address)
	case strings.HasSuffix(trimmed, "/v1"):
		return trimmed[:len(trimmed)-len("/v1")], nil
	default:
		return trimmed, nil
	}
}
// newV1EndpointFromStr builds a v1Endpoint from the given address string,
// defaulting to HTTPS when no scheme is present and rejecting v2 addresses.
func newV1EndpointFromStr(address string, tlsConfig *tls.Config, headers http.Header) (*v1Endpoint, error) {
	// Default to HTTPS when the address carries no scheme.
	if !strings.HasPrefix(address, "http://") && !strings.HasPrefix(address, "https://") {
		address = "https://" + address
	}

	trimmed, err := trimV1Address(address)
	if err != nil {
		return nil, err
	}

	endpointURL, err := url.Parse(trimmed)
	if err != nil {
		return nil, invalidParam(err)
	}

	// TODO(tiborvass): make sure a ConnectTimeout transport is used
	base := newTransport(tlsConfig)

	return &v1Endpoint{
		IsSecure: tlsConfig == nil || !tlsConfig.InsecureSkipVerify,
		URL:      endpointURL,
		client:   httpClient(transport.NewTransport(base, Headers("", headers)...)),
	}, nil
}
// String returns the formatted URL for the root ("/v1/") of this registry
// endpoint.
func (e *v1Endpoint) String() string {
	return e.URL.String() + "/v1/"
}
// ping returns a v1PingResult which indicates whether the registry is standalone or not.
//
// The standalone flag is read from the "X-Docker-Registry-Standalone"
// response header when present; otherwise it is decoded from the JSON body
// of the "_ping" response, defaulting to true when the body cannot be
// decoded.
func (e *v1Endpoint) ping(ctx context.Context) (v1PingResult, error) {
	if e.String() == IndexServer {
		// Skip the check, we know this one is valid
		// (and we never want to fallback to http in case of error)
		return v1PingResult{}, nil
	}

	pingURL := e.String() + "_ping"
	log.G(ctx).WithField("url", pingURL).Debug("attempting v1 ping for registry endpoint")
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, pingURL, http.NoBody)
	if err != nil {
		return v1PingResult{}, invalidParam(err)
	}

	resp, err := e.client.Do(req)
	if err != nil {
		if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
			// Return context errors as-is so callers can distinguish
			// cancellation from an unreachable registry.
			return v1PingResult{}, err
		}
		return v1PingResult{}, invalidParam(err)
	}

	defer resp.Body.Close()

	if v := resp.Header.Get("X-Docker-Registry-Standalone"); v != "" {
		info := v1PingResult{}
		// Accepted values are "1", and "true" (case-insensitive).
		if v == "1" || strings.EqualFold(v, "true") {
			info.Standalone = true
		}
		log.G(ctx).Debugf("v1PingResult.Standalone (from X-Docker-Registry-Standalone header): %t", info.Standalone)
		return info, nil
	}

	// If the header is absent, we assume true for compatibility with earlier
	// versions of the registry. default to true
	info := v1PingResult{
		Standalone: true,
	}
	if err := json.NewDecoder(resp.Body).Decode(&info); err != nil {
		log.G(ctx).WithError(err).Debug("error unmarshaling _ping response")
		// don't stop here. Just assume sane defaults
	}

	log.G(ctx).Debugf("v1PingResult.Standalone: %t", info.Standalone)
	return info, nil
}
// httpClient returns an HTTP client structure which uses the given transport
// and contains the necessary headers for redirected requests.
//
// The parameter is named rt (rather than "transport") so it does not shadow
// the imported "transport" package within this function.
func httpClient(rt http.RoundTripper) *http.Client {
	return &http.Client{
		Transport:     rt,
		CheckRedirect: addRequiredHeadersToRedirectedRequests,
	}
}
func trustedLocation(req *http.Request) bool {
var (
trusteds = []string{"docker.com", "docker.io"}
hostname = strings.SplitN(req.Host, ":", 2)[0]
)
if req.URL.Scheme != "https" {
return false
}
for _, trusted := range trusteds {
if hostname == trusted || strings.HasSuffix(hostname, "."+trusted) {
return true
}
}
return false
}
// addRequiredHeadersToRedirectedRequests adds the necessary redirection headers
// for redirected requests.
func addRequiredHeadersToRedirectedRequests(req *http.Request, via []*http.Request) error {
	if len(via) == 0 || via[0] == nil {
		return nil
	}
	if trustedLocation(req) && trustedLocation(via[0]) {
		// Redirect between trusted locations: carry over all headers,
		// including credentials.
		req.Header = via[0].Header
		return nil
	}
	// Otherwise copy every header except Authorization.
	for name, values := range via[0].Header {
		if name == "Authorization" {
			continue
		}
		for _, value := range values {
			req.Header.Add(name, value)
		}
	}
	return nil
}
package registry
import (
// this is required for some certificates
"context"
_ "crypto/sha512"
"encoding/json"
"fmt"
"io"
"net/http"
"net/http/cookiejar"
"net/url"
"strings"
"sync"
"github.com/containerd/log"
"github.com/moby/moby/api/types/registry"
"github.com/pkg/errors"
)
// A session is used to communicate with a V1 registry
type session struct {
	indexEndpoint *v1Endpoint  // endpoint the session talks to
	client        *http.Client // authorized client used for all requests
}
// authTransport is an http.RoundTripper that attaches authentication
// headers when communicating with a v1 registry; see newAuthTransport for
// the full behavior description.
type authTransport struct {
	base               http.RoundTripper
	authConfig         *registry.AuthConfig
	alwaysSetBasicAuth bool
	token              []string // cached from an X-Docker-Token response header

	mu     sync.Mutex                      // guards modReq
	modReq map[*http.Request]*http.Request // original -> modified
}
// newAuthTransport handles the auth layer when communicating with a v1 registry (private or official)
//
// For private v1 registries, set alwaysSetBasicAuth to true.
//
// For the official v1 registry, if there isn't already an Authorization header in the request,
// but there is an X-Docker-Token header set to true, then Basic Auth will be used to set the Authorization header.
// After sending the request with the provided base http.RoundTripper, if an X-Docker-Token header, representing
// a token, is present in the response, then it gets cached and sent in the Authorization header of all subsequent
// requests.
//
// If the server sends a token without the client having requested it, it is ignored.
//
// This RoundTripper also has a CancelRequest method important for correct timeout handling.
func newAuthTransport(base http.RoundTripper, authConfig *registry.AuthConfig, alwaysSetBasicAuth bool) *authTransport {
	rt := base
	if rt == nil {
		rt = http.DefaultTransport
	}
	return &authTransport{
		base:               rt,
		authConfig:         authConfig,
		alwaysSetBasicAuth: alwaysSetBasicAuth,
		modReq:             make(map[*http.Request]*http.Request),
	}
}
// cloneRequest returns a clone of the provided *http.Request.
// The clone is a shallow copy of the struct and its Header map.
func cloneRequest(r *http.Request) *http.Request {
// shallow copy of the struct
r2 := new(http.Request)
*r2 = *r
// deep copy of the Header
r2.Header = make(http.Header, len(r.Header))
for k, s := range r.Header {
r2.Header[k] = append([]string(nil), s...)
}
return r2
}
// onEOFReader wraps an io.ReadCloser and a function
// the function will run at the end of file or close the file.
type onEOFReader struct {
Rc io.ReadCloser
Fn func()
}
func (r *onEOFReader) Read(p []byte) (int, error) {
n, err := r.Rc.Read(p)
if err == io.EOF {
r.runFunc()
}
return n, err
}
// Close closes the file and run the function.
func (r *onEOFReader) Close() error {
err := r.Rc.Close()
r.runFunc()
return err
}
func (r *onEOFReader) runFunc() {
if fn := r.Fn; fn != nil {
fn()
r.Fn = nil
}
}
// RoundTrip changes an HTTP request's headers to add the necessary
// authentication-related headers.
//
// When alwaysSetBasicAuth is set, Basic Auth credentials are attached to
// every request. Otherwise credentials are only attached when the request
// carries "X-Docker-Token: true"; a token returned by the server in the
// X-Docker-Token response header is cached and sent as the Authorization
// header of subsequent requests.
//
// The original request is never modified; a clone is sent instead. The
// original -> clone mapping is kept in tr.modReq so CancelRequest can find
// the in-flight clone; the entry is removed when the response body reaches
// EOF or is closed.
func (tr *authTransport) RoundTrip(orig *http.Request) (*http.Response, error) {
	// Authorization should not be set on 302 redirect for untrusted locations.
	// This logic mirrors the behavior in addRequiredHeadersToRedirectedRequests.
	// As the authorization logic is currently implemented in RoundTrip,
	// a 302 redirect is detected by looking at the Referrer header as go http package adds said header.
	// This is safe as Docker doesn't set Referrer in other scenarios.
	if orig.Header.Get("Referer") != "" && !trustedLocation(orig) {
		return tr.base.RoundTrip(orig)
	}

	req := cloneRequest(orig)
	tr.mu.Lock()
	tr.modReq[orig] = req
	tr.mu.Unlock()

	if tr.alwaysSetBasicAuth {
		if tr.authConfig == nil {
			return nil, errors.New("unexpected error: empty auth config")
		}
		req.SetBasicAuth(tr.authConfig.Username, tr.authConfig.Password)
		return tr.base.RoundTrip(req)
	}

	// Don't override
	if req.Header.Get("Authorization") == "" {
		if req.Header.Get("X-Docker-Token") == "true" && tr.authConfig != nil && tr.authConfig.Username != "" {
			req.SetBasicAuth(tr.authConfig.Username, tr.authConfig.Password)
		} else if len(tr.token) > 0 {
			req.Header.Set("Authorization", "Token "+strings.Join(tr.token, ","))
		}
	}

	resp, err := tr.base.RoundTrip(req)
	if err != nil {
		// The request is no longer in flight; drop the bookkeeping entry.
		tr.mu.Lock()
		delete(tr.modReq, orig)
		tr.mu.Unlock()
		return nil, err
	}

	if len(resp.Header["X-Docker-Token"]) > 0 {
		tr.token = resp.Header["X-Docker-Token"]
	}

	// Defer cleanup of tr.modReq until the body is fully read or closed,
	// so the request remains cancelable while the body is streaming.
	resp.Body = &onEOFReader{
		Rc: resp.Body,
		Fn: func() {
			tr.mu.Lock()
			delete(tr.modReq, orig)
			tr.mu.Unlock()
		},
	}
	return resp, nil
}
// CancelRequest cancels an in-flight request by closing its connection.
//
// It looks up the modified request (the clone created by RoundTrip) that
// corresponds to the caller's original request and forwards the cancellation
// to the underlying transport, if that transport supports CancelRequest.
func (tr *authTransport) CancelRequest(req *http.Request) {
	type canceler interface {
		CancelRequest(*http.Request)
	}
	if cr, ok := tr.base.(canceler); ok {
		tr.mu.Lock()
		modReq := tr.modReq[req]
		delete(tr.modReq, req)
		tr.mu.Unlock()
		// NOTE(review): modReq may be nil if the request was never seen by
		// RoundTrip (or already completed); behavior then depends on the
		// underlying transport.
		cr.CancelRequest(modReq)
	}
}
// authorizeClient equips the given HTTP client with an authTransport and a
// cookie jar for talking to the given v1 endpoint.
//
// Basic Auth is attached to every request (alwaysSetBasicAuth) only for
// standalone private registries reached over HTTPS.
func authorizeClient(ctx context.Context, client *http.Client, authConfig *registry.AuthConfig, endpoint *v1Endpoint) error {
	var alwaysSetBasicAuth bool

	// If we're working with a standalone private registry over HTTPS, send Basic Auth headers
	// alongside all our requests.
	if endpoint.String() != IndexServer && endpoint.URL.Scheme == "https" {
		info, err := endpoint.ping(ctx)
		if err != nil {
			return err
		}
		if info.Standalone && authConfig != nil {
			log.G(ctx).WithField("endpoint", endpoint.String()).Debug("Endpoint is eligible for private registry; enabling alwaysSetBasicAuth")
			alwaysSetBasicAuth = true
		}
	}

	// Annotate the transport unconditionally so that v2 can
	// properly fallback on v1 when an image is not found.
	client.Transport = newAuthTransport(client.Transport, authConfig, alwaysSetBasicAuth)

	jar, err := cookiejar.New(nil)
	if err != nil {
		// cookiejar.New(nil) does not fail in the current stdlib; treat a
		// non-nil error as an internal invariant violation.
		return systemErr{errors.New("cookiejar.New is not supposed to return an error")}
	}
	client.Jar = jar
	return nil
}
// newSession returns a session bound to the given client and endpoint.
func newSession(client *http.Client, endpoint *v1Endpoint) *session {
	return &session{
		indexEndpoint: endpoint,
		client:        client,
	}
}
// defaultSearchLimit is the default value for maximum number of returned search results.
const defaultSearchLimit = 25
// searchRepositories performs a search against the remote repository
func (r *session) searchRepositories(ctx context.Context, term string, limit int) (*registry.SearchResults, error) {
if limit == 0 {
limit = defaultSearchLimit
}
if limit < 1 || limit > 100 {
return nil, invalidParamf("limit %d is outside the range of [1, 100]", limit)
}
u := r.indexEndpoint.String() + "search?q=" + url.QueryEscape(term) + "&n=" + url.QueryEscape(fmt.Sprintf("%d", limit))
log.G(ctx).WithField("url", u).Debug("searchRepositories")
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, http.NoBody)
if err != nil {
return nil, invalidParamWrapf(err, "error building request")
}
// Have the AuthTransport send authentication, when logged in.
req.Header.Set("X-Docker-Token", "true")
res, err := r.client.Do(req)
if err != nil {
return nil, systemErr{err}
}
defer res.Body.Close()
if res.StatusCode != http.StatusOK {
// TODO(thaJeztah): return upstream response body for errors (see https://github.com/moby/moby/issues/27286).
// TODO(thaJeztah): handle other status-codes to return correct error-type
return nil, errUnknown{fmt.Errorf("Unexpected status code %d", res.StatusCode)}
}
result := ®istry.SearchResults{}
err = json.NewDecoder(res.Body).Decode(result)
if err != nil {
return nil, systemErr{errors.Wrap(err, "error decoding registry search results")}
}
return result, nil
}
package registry
import (
"context"
"crypto/tls"
"errors"
"net/url"
"strings"
"sync"
cerrdefs "github.com/containerd/errdefs"
"github.com/containerd/log"
"github.com/distribution/reference"
"github.com/moby/moby/api/types/registry"
)
// Service is a registry service. It tracks configuration data such as a list
// of mirrors.
type Service struct {
	config *serviceConfig
	mu     sync.RWMutex // protects config
}
// NewService returns a new instance of [Service] ready to be installed into
// an engine.
func NewService(options ServiceOptions) (*Service, error) {
	config, err := newServiceConfig(options)
	if err != nil {
		return nil, err
	}
	// Return a literal nil error on success instead of re-returning err
	// (which is always nil here) — clearer and avoids accidental coupling.
	return &Service{config: config}, nil
}
// ServiceConfig returns a copy of the public registry service's configuration.
func (s *Service) ServiceConfig() *registry.ServiceConfig {
	s.mu.RLock()
	cfg := s.config.copy()
	s.mu.RUnlock()
	return cfg
}
// ReplaceConfig prepares a transaction which will atomically replace the
// registry service's configuration when the returned commit function is called.
func (s *Service) ReplaceConfig(options ServiceOptions) (commit func(), _ error) {
	config, err := newServiceConfig(options)
	if err != nil {
		return nil, err
	}
	// The new config is fully built before commit; committing only swaps
	// the pointer under the write lock.
	commit = func() {
		s.mu.Lock()
		s.config = config
		s.mu.Unlock()
	}
	return commit, nil
}
// Auth contacts the public registry with the provided credentials,
// and returns OK if authentication was successful.
// It can be used to verify the validity of a client's credentials.
//
// When authConfig.ServerAddress is empty, the default index hostname is
// used. Each candidate endpoint is tried in order: authentication failures
// and context cancellation abort immediately, while other errors fall
// through to the next endpoint.
func (s *Service) Auth(ctx context.Context, authConfig *registry.AuthConfig, userAgent string) (statusMessage, token string, _ error) {
	// TODO Use ctx when searching for repositories
	registryHostName := IndexHostname

	if authConfig.ServerAddress != "" {
		serverAddress := authConfig.ServerAddress
		// Default to HTTPS when the address has no scheme.
		if !strings.HasPrefix(serverAddress, "https://") && !strings.HasPrefix(serverAddress, "http://") {
			serverAddress = "https://" + serverAddress
		}
		u, err := url.Parse(serverAddress)
		if err != nil {
			return "", "", invalidParamWrapf(err, "unable to parse server address")
		}
		registryHostName = u.Host
	}

	// Lookup endpoints for authentication but exclude mirrors to prevent
	// sending credentials of the upstream registry to a mirror.
	s.mu.RLock()
	endpoints, err := s.lookupV2Endpoints(ctx, registryHostName, false)
	s.mu.RUnlock()
	if err != nil {
		if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
			return "", "", err
		}
		return "", "", invalidParam(err)
	}

	var lastErr error
	for _, endpoint := range endpoints {
		authToken, err := loginV2(ctx, authConfig, endpoint, userAgent)
		if err != nil {
			if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) || cerrdefs.IsUnauthorized(err) {
				// Failed to authenticate; don't continue with (non-TLS) endpoints.
				return "", "", err
			}
			// Try next endpoint
			log.G(ctx).WithFields(log.Fields{
				"error":    err,
				"endpoint": endpoint,
			}).Infof("Error logging in to endpoint, trying next endpoint")
			lastErr = err
			continue
		}

		// TODO(thaJeztah): move the statusMessage to the API endpoint; we don't need to produce that here?
		return "Login Succeeded", authToken, nil
	}

	return "", "", lastErr
}
// ResolveAuthConfig looks up authentication for the given reference from the
// given authConfigs.
//
// IMPORTANT: This function is for internal use and should not be used by external projects.
func (s *Service) ResolveAuthConfig(authConfigs map[string]registry.AuthConfig, ref reference.Named) registry.AuthConfig {
s.mu.RLock()
defer s.mu.RUnlock()
// Simplified version of "newIndexInfo" without handling of insecure
// registries and mirrors, as we don't need that information to resolve
// the auth-config.
indexName := normalizeIndexName(reference.Domain(ref))
registryInfo, ok := s.config.IndexConfigs[indexName]
if !ok {
registryInfo = ®istry.IndexInfo{Name: indexName}
}
return ResolveAuthConfig(authConfigs, registryInfo)
}
// APIEndpoint represents a remote API endpoint
type APIEndpoint struct {
	Mirror    bool        // true when this endpoint is a configured registry mirror
	URL       *url.URL    // base URL of the endpoint
	TLSConfig *tls.Config // TLS settings; InsecureSkipVerify marks insecure registries
}
// LookupPullEndpoints creates a list of v2 endpoints to try to pull from, in order of preference.
// It gives preference to mirrors over the actual registry, and HTTPS over plain HTTP.
func (s *Service) LookupPullEndpoints(hostname string) ([]APIEndpoint, error) {
	s.mu.RLock()
	defer s.mu.RUnlock()
	return s.lookupV2Endpoints(context.TODO(), hostname, true)
}
// LookupPushEndpoints creates a list of v2 endpoints to try to push to, in order of preference.
// It gives preference to HTTPS over plain HTTP. Mirrors are not included.
func (s *Service) LookupPushEndpoints(hostname string) ([]APIEndpoint, error) {
	s.mu.RLock()
	defer s.mu.RUnlock()
	return s.lookupV2Endpoints(context.TODO(), hostname, false)
}
// IsInsecureRegistry returns true if the registry at given host is configured as
// insecure registry.
func (s *Service) IsInsecureRegistry(host string) bool {
	s.mu.RLock()
	defer s.mu.RUnlock()
	return !s.config.isSecureIndex(host)
}
package registry
import (
"context"
"net/url"
"strings"
"github.com/docker/go-connections/tlsconfig"
)
// lookupV2Endpoints returns the candidate v2 endpoints for hostname, in
// order of preference. For the default (Docker Hub) namespace, configured
// mirrors come first (when includeMirrors is set), followed by the official
// registry. For any other hostname an HTTPS endpoint is returned, plus a
// plain-HTTP fallback when the registry is configured as insecure.
func (s *Service) lookupV2Endpoints(ctx context.Context, hostname string, includeMirrors bool) ([]APIEndpoint, error) {
	var endpoints []APIEndpoint
	if hostname == DefaultNamespace || hostname == IndexHostname {
		if includeMirrors {
			for _, mirror := range s.config.Mirrors {
				// Bail out early if the caller gave up.
				if ctx.Err() != nil {
					return nil, ctx.Err()
				}
				if !strings.HasPrefix(mirror, "http://") && !strings.HasPrefix(mirror, "https://") {
					// Assume HTTPS when the mirror carries no scheme.
					mirror = "https://" + mirror
				}
				mirrorURL, err := url.Parse(mirror)
				if err != nil {
					return nil, invalidParam(err)
				}
				// TODO(thaJeztah); this should all be memoized when loading the config. We're resolving mirrors and loading TLS config every time.
				mirrorTLSConfig, err := newTLSConfig(ctx, mirrorURL.Host, s.config.isSecureIndex(mirrorURL.Host))
				if err != nil {
					return nil, err
				}
				endpoints = append(endpoints, APIEndpoint{
					URL:       mirrorURL,
					Mirror:    true,
					TLSConfig: mirrorTLSConfig,
				})
			}
		}
		// The official registry is always the final candidate.
		endpoints = append(endpoints, APIEndpoint{
			URL:       DefaultV2Registry,
			TLSConfig: tlsconfig.ServerDefault(),
		})

		return endpoints, nil
	}

	tlsConfig, err := newTLSConfig(ctx, hostname, s.config.isSecureIndex(hostname))
	if err != nil {
		return nil, err
	}

	endpoints = []APIEndpoint{
		{
			URL: &url.URL{
				Scheme: "https",
				Host:   hostname,
			},
			TLSConfig: tlsConfig,
		},
	}

	if tlsConfig.InsecureSkipVerify {
		endpoints = append(endpoints, APIEndpoint{
			URL: &url.URL{
				Scheme: "http",
				Host:   hostname,
			},
			// used to check if supposed to be secure via InsecureSkipVerify
			TLSConfig: tlsConfig,
		})
	}

	return endpoints, nil
}