package apparmor
import (
"errors"
"fmt"
"os"
"sync"
"github.com/opencontainers/runc/libcontainer/utils"
)
var (
appArmorEnabled bool
checkAppArmor sync.Once
)
// isEnabled returns true if apparmor is enabled for the host.
func isEnabled() bool {
checkAppArmor.Do(func() {
if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil {
buf, err := os.ReadFile("/sys/module/apparmor/parameters/enabled")
appArmorEnabled = err == nil && len(buf) > 1 && buf[0] == 'Y'
}
})
return appArmorEnabled
}
func setProcAttr(attr, value string) error {
attr = utils.CleanPath(attr)
attrSubPath := "attr/apparmor/" + attr
if _, err := os.Stat("/proc/self/" + attrSubPath); errors.Is(err, os.ErrNotExist) {
// fall back to the old convention
attrSubPath = "attr/" + attr
}
// Under AppArmor you can only change your own attr, so there's no reason
// to not use /proc/thread-self/ (instead of /proc/<tid>/, like libapparmor
// does).
attrPath, closer := utils.ProcThreadSelf(attrSubPath)
defer closer()
f, err := os.OpenFile(attrPath, os.O_WRONLY, 0)
if err != nil {
return err
}
defer f.Close()
if err := utils.EnsureProcHandle(f); err != nil {
return err
}
_, err = f.WriteString(value)
return err
}
// changeOnExec reimplements aa_change_onexec from libapparmor in Go
func changeOnExec(name string) error {
if err := setProcAttr("exec", "exec "+name); err != nil {
return fmt.Errorf("apparmor failed to apply profile: %w", err)
}
return nil
}
// applyProfile will apply the profile with the specified name to the process after
// the next exec. It is only supported on Linux and produces an error on other
// platforms.
func applyProfile(name string) error {
if name == "" {
return nil
}
return changeOnExec(name)
}
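// applyProfileExample is an illustrative sketch (not part of the original
// file): applyProfile is expected to be called right before execve(2), since
// the profile only takes effect on the next exec in this thread. The profile
// name used here is a placeholder.
func applyProfileExample() error {
	if err := applyProfile("docker-default"); err != nil {
		return err
	}
	// ...exec the container process here; it runs confined by the profile.
	return nil
}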
//go:build linux
package capabilities
import (
"errors"
"fmt"
"sort"
"strings"
"sync"
"syscall"
"github.com/moby/sys/capability"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
)
func capToStr(c capability.Cap) string {
return "CAP_" + strings.ToUpper(c.String())
}
var capMap = sync.OnceValues(func() (map[string]capability.Cap, error) {
list, err := capability.ListSupported()
if err != nil {
return nil, err
}
cm := make(map[string]capability.Cap, len(list))
for _, c := range list {
cm[capToStr(c)] = c
}
return cm, nil
})
// KnownCapabilities returns the list of the known capabilities.
// Used by `runc features`.
func KnownCapabilities() []string {
list := capability.ListKnown()
res := make([]string, len(list))
for i, c := range list {
res[i] = "CAP_" + strings.ToUpper(c.String())
}
return res
}
// New creates a new Caps from the given Capabilities config. Unknown Capabilities
// or Capabilities that are unavailable in the current environment are ignored,
// and a warning is logged instead.
func New(capConfig *configs.Capabilities) (*Caps, error) {
var c Caps
if capConfig == nil {
return &c, nil
}
_, err := capMap()
if err != nil {
return nil, err
}
unknownCaps := make(map[string]struct{})
c.caps = map[capability.CapType][]capability.Cap{
capability.BOUNDING: capSlice(capConfig.Bounding, unknownCaps),
capability.EFFECTIVE: capSlice(capConfig.Effective, unknownCaps),
capability.INHERITABLE: capSlice(capConfig.Inheritable, unknownCaps),
capability.PERMITTED: capSlice(capConfig.Permitted, unknownCaps),
capability.AMBIENT: capSlice(capConfig.Ambient, unknownCaps),
}
if c.pid, err = capability.NewPid2(0); err != nil {
return nil, err
}
if len(unknownCaps) > 0 {
logrus.Warn("ignoring unknown or unavailable capabilities: ", mapKeys(unknownCaps))
}
return &c, nil
}
// capSlice converts the slice of capability names in caps to their numeric
// equivalents, and returns them as a slice. Unknown or unavailable
// capabilities are not returned, but are added to unknownCaps.
func capSlice(caps []string, unknownCaps map[string]struct{}) []capability.Cap {
cm, _ := capMap()
out := make([]capability.Cap, 0, len(caps))
for _, c := range caps {
if v, ok := cm[c]; !ok {
unknownCaps[c] = struct{}{}
} else {
out = append(out, v)
}
}
return out
}
// mapKeys returns the keys of input in sorted order
func mapKeys(input map[string]struct{}) []string {
keys := make([]string, 0, len(input))
for c := range input {
keys = append(keys, c)
}
sort.Strings(keys)
return keys
}
// Caps holds the capabilities for a container.
type Caps struct {
pid capability.Capabilities
caps map[capability.CapType][]capability.Cap
}
// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist.
func (c *Caps) ApplyBoundingSet() error {
if c.pid == nil {
return nil
}
c.pid.Clear(capability.BOUNDING)
c.pid.Set(capability.BOUNDING, c.caps[capability.BOUNDING]...)
return c.pid.Apply(capability.BOUNDING)
}
// ApplyCaps sets all the capabilities for the current process, as specified in the config.
func (c *Caps) ApplyCaps() error {
if c.pid == nil {
return nil
}
c.pid.Clear(capability.CAPS | capability.BOUNDS)
for _, g := range []capability.CapType{
capability.EFFECTIVE,
capability.PERMITTED,
capability.INHERITABLE,
capability.BOUNDING,
} {
c.pid.Set(g, c.caps[g]...)
}
if err := c.pid.Apply(capability.CAPS | capability.BOUNDS); err != nil {
return fmt.Errorf("can't apply capabilities: %w", err)
}
// Older versions of the capability package used to ignore errors from
// setting ambient capabilities, which is now fixed (see
// https://github.com/kolyshkin/capability/pull/3).
//
// To maintain backward compatibility, set ambient caps one by one and
// don't return any errors, only warn.
ambs := c.caps[capability.AMBIENT]
err := capability.ResetAmbient()
// EINVAL is returned when the kernel doesn't support ambient capabilities.
// We ignore this because runc supports running on older kernels.
if err != nil && !errors.Is(err, syscall.EINVAL) {
return err
}
for _, a := range ambs {
err := capability.SetAmbient(true, a)
if err != nil {
logrus.Warnf("can't raise ambient capability %s: %v", capToStr(a), err)
}
}
return nil
}
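// applyExample is an illustrative sketch (not part of the original file):
// building a Caps from a minimal configs.Capabilities and applying it to the
// current process. The capability set shown is just a placeholder.
func applyExample() error {
	caps, err := New(&configs.Capabilities{
		Bounding:  []string{"CAP_NET_BIND_SERVICE"},
		Effective: []string{"CAP_NET_BIND_SERVICE"},
		Permitted: []string{"CAP_NET_BIND_SERVICE"},
	})
	if err != nil {
		return err
	}
	return caps.ApplyCaps()
}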
package cgroups
import "fmt"
// BlockIODevice holds major:minor format supported in blkio cgroup.
type BlockIODevice struct {
// Major is the device's major number
Major int64 `json:"major"`
// Minor is the device's minor number
Minor int64 `json:"minor"`
}
// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair
type WeightDevice struct {
BlockIODevice
// Weight is the bandwidth rate for the device, range is from 10 to 1000
Weight uint16 `json:"weight"`
// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
LeafWeight uint16 `json:"leafWeight"`
}
// NewWeightDevice returns a configured WeightDevice pointer
func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice {
wd := &WeightDevice{}
wd.Major = major
wd.Minor = minor
wd.Weight = weight
wd.LeafWeight = leafWeight
return wd
}
// WeightString formats the struct to be writable to the cgroup specific file
func (wd *WeightDevice) WeightString() string {
return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight)
}
// LeafWeightString formats the struct to be writable to the cgroup specific file
func (wd *WeightDevice) LeafWeightString() string {
return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight)
}
// ThrottleDevice struct holds a `major:minor rate_per_second` pair
type ThrottleDevice struct {
BlockIODevice
// Rate is the IO rate limit per cgroup per device
Rate uint64 `json:"rate"`
}
// NewThrottleDevice returns a configured ThrottleDevice pointer
func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice {
td := &ThrottleDevice{}
td.Major = major
td.Minor = minor
td.Rate = rate
return td
}
// String formats the struct to be writable to the cgroup specific file
func (td *ThrottleDevice) String() string {
return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate)
}
// StringName formats the struct to be writable to the cgroup specific file
func (td *ThrottleDevice) StringName(name string) string {
return fmt.Sprintf("%d:%d %s=%d", td.Major, td.Minor, name, td.Rate)
}
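// blkioStringsExample is an illustrative sketch (not part of the original
// file) of the strings these helpers produce for a device with major 8,
// minor 0 (a typical first SCSI disk, used purely as an example).
func blkioStringsExample() []string {
	wd := NewWeightDevice(8, 0, 500, 300)
	td := NewThrottleDevice(8, 0, 1048576)
	return []string{
		wd.WeightString(),     // "8:0 500"
		wd.LeafWeightString(), // "8:0 300"
		td.String(),           // "8:0 1048576"
		td.StringName("wbps"), // "8:0 wbps=1048576"
	}
}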
package cgroups
import (
"fmt"
)
type IfPrioMap struct {
Interface string `json:"interface"`
Priority int64 `json:"priority"`
}
func (i *IfPrioMap) CgroupString() string {
return fmt.Sprintf("%s %d", i.Interface, i.Priority)
}
package config
import (
"fmt"
"os"
"strconv"
)
const (
Wildcard = -1
)
type Device struct {
Rule
// Path to the device.
Path string `json:"path"`
// FileMode permission bits for the device.
FileMode os.FileMode `json:"file_mode"`
// Uid of the device.
Uid uint32 `json:"uid"`
// Gid of the device.
Gid uint32 `json:"gid"`
}
// Permissions is a cgroupv1-style string to represent device access. It
// has to be a string for backward compatibility reasons, which is why it
// has methods to perform set operations.
type Permissions string
const (
deviceRead uint = (1 << iota)
deviceWrite
deviceMknod
)
func (p Permissions) toSet() uint {
var set uint
for _, perm := range p {
switch perm {
case 'r':
set |= deviceRead
case 'w':
set |= deviceWrite
case 'm':
set |= deviceMknod
}
}
return set
}
func fromSet(set uint) Permissions {
var perm string
if set&deviceRead == deviceRead {
perm += "r"
}
if set&deviceWrite == deviceWrite {
perm += "w"
}
if set&deviceMknod == deviceMknod {
perm += "m"
}
return Permissions(perm)
}
// Union returns the union of the two sets of Permissions.
func (p Permissions) Union(o Permissions) Permissions {
lhs := p.toSet()
rhs := o.toSet()
return fromSet(lhs | rhs)
}
// Difference returns the set difference of the two sets of Permissions.
// In set notation, A.Difference(B) gives you A\B.
func (p Permissions) Difference(o Permissions) Permissions {
lhs := p.toSet()
rhs := o.toSet()
return fromSet(lhs &^ rhs)
}
// Intersection computes the intersection of the two sets of Permissions.
func (p Permissions) Intersection(o Permissions) Permissions {
lhs := p.toSet()
rhs := o.toSet()
return fromSet(lhs & rhs)
}
// IsEmpty returns whether the set of permissions in a Permissions is
// empty.
func (p Permissions) IsEmpty() bool {
return p == Permissions("")
}
// IsValid returns whether the set of permissions is a subset of valid
// permissions (namely, {r,w,m}).
func (p Permissions) IsValid() bool {
return p == fromSet(p.toSet())
}
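// permissionsExample is an illustrative sketch (not part of the original
// file) of how the set operations compose for p = "rwm" and o = "rm".
func permissionsExample() (union, inter, diff Permissions) {
	p, o := Permissions("rwm"), Permissions("rm")
	// Union -> "rwm", Intersection -> "rm", Difference (p\o) -> "w".
	return p.Union(o), p.Intersection(o), p.Difference(o)
}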
type Type rune
const (
WildcardDevice Type = 'a'
BlockDevice Type = 'b'
CharDevice Type = 'c' // or 'u'
FifoDevice Type = 'p'
)
func (t Type) IsValid() bool {
switch t {
case WildcardDevice, BlockDevice, CharDevice, FifoDevice:
return true
default:
return false
}
}
func (t Type) CanMknod() bool {
switch t {
case BlockDevice, CharDevice, FifoDevice:
return true
default:
return false
}
}
func (t Type) CanCgroup() bool {
switch t {
case WildcardDevice, BlockDevice, CharDevice:
return true
default:
return false
}
}
type Rule struct {
// Type of device ('c' for char, 'b' for block). If set to 'a', this rule
// acts as a wildcard and all fields other than Allow are ignored.
Type Type `json:"type"`
// Major is the device's major number.
Major int64 `json:"major"`
// Minor is the device's minor number.
Minor int64 `json:"minor"`
// Permissions is the set of permissions that this rule applies to (in the
// cgroupv1 format -- any combination of "rwm").
Permissions Permissions `json:"permissions"`
// Allow specifies whether this rule is allowed.
Allow bool `json:"allow"`
}
func (d *Rule) CgroupString() string {
var (
major = strconv.FormatInt(d.Major, 10)
minor = strconv.FormatInt(d.Minor, 10)
)
if d.Major == Wildcard {
major = "*"
}
if d.Minor == Wildcard {
minor = "*"
}
return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions)
}
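// cgroupStringExample is an illustrative sketch (not part of the original
// file). A rule allowing read/write on char device 10:200 (commonly
// /dev/net/tun, used here only as an example) renders as "c 10:200 rw";
// a Wildcard major or minor would render as "*".
func cgroupStringExample() string {
	r := Rule{Type: CharDevice, Major: 10, Minor: 200, Permissions: "rw", Allow: true}
	return r.CgroupString() // "c 10:200 rw"
}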
func (d *Rule) Mkdev() (uint64, error) {
return mkDev(d)
}
package config
import (
"errors"
"golang.org/x/sys/unix"
)
func mkDev(d *Rule) (uint64, error) {
if d.Major == Wildcard || d.Minor == Wildcard {
return 0, errors.New("cannot mkdev() device with wildcards")
}
return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
}
// Implements creation of eBPF device filter program.
//
// Based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
//
// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
package devices
import (
"errors"
"fmt"
"math"
"strconv"
"github.com/cilium/ebpf/asm"
devices "github.com/opencontainers/runc/libcontainer/cgroups/devices/config"
"golang.org/x/sys/unix"
)
const (
// license string format is same as kernel MODULE_LICENSE macro
license = "Apache"
)
// deviceFilter returns eBPF device filter program and its license string.
func deviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
// Generate the minimum ruleset for the device rules we are given. While we
// don't care about minimum transitions in cgroupv2, using the emulator
// gives us a guarantee that the behaviour of devices filtering is the same
// as cgroupv1, including security hardenings to avoid misconfiguration
// (such as punching holes in wildcard rules).
emu := new(emulator)
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, "", err
}
}
cleanRules, err := emu.Rules()
if err != nil {
return nil, "", err
}
p := &program{
defaultAllow: emu.IsBlacklist(),
}
p.init()
for idx, rule := range cleanRules {
if rule.Type == devices.WildcardDevice {
// We can safely skip over wildcard entries because there should
// only be one (at most) at the very start to instruct cgroupv1 to
// go into allow-list mode. However we do double-check this here.
if idx != 0 || rule.Allow != emu.IsBlacklist() {
return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString())
}
continue
}
if rule.Allow == p.defaultAllow {
// There should be no rules which have an action equal to the
// default action, the emulator removes those.
return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString())
}
if err := p.appendRule(rule); err != nil {
return nil, "", err
}
}
return p.finalize(), license, nil
}
type program struct {
insts asm.Instructions
defaultAllow bool
blockID int
}
func (p *program) init() {
// struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
/*
u32 access_type
u32 major
u32 minor
*/
// R2 <- type (lower 16 bit of u32 access_type at R1[0])
p.insts = append(p.insts,
asm.LoadMem(asm.R2, asm.R1, 0, asm.Word),
asm.And.Imm32(asm.R2, 0xFFFF))
// R3 <- access (upper 16 bit of u32 access_type at R1[0])
p.insts = append(p.insts,
asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
// RSh: bitwise shift right
asm.RSh.Imm32(asm.R3, 16))
// R4 <- major (u32 major at R1[4])
p.insts = append(p.insts,
asm.LoadMem(asm.R4, asm.R1, 4, asm.Word))
// R5 <- minor (u32 minor at R1[8])
p.insts = append(p.insts,
asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
}
// appendRule converts an OCI rule to the relevant eBPF block and appends it
// to the in-progress filter program. In order to operate properly, it must be
// called with a "clean" rule list (generated by devices.Emulator.Rules() --
// with any "a" rules removed).
func (p *program) appendRule(rule *devices.Rule) error {
if p.blockID < 0 {
return errors.New("the program is finalized")
}
var bpfType int32
switch rule.Type {
case devices.CharDevice:
bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
case devices.BlockDevice:
bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
default:
// We do not permit 'a', nor any other types we don't know about.
return fmt.Errorf("invalid type %q", string(rule.Type))
}
if rule.Major > math.MaxUint32 {
return fmt.Errorf("invalid major %d", rule.Major)
}
if rule.Minor > math.MaxUint32 {
return fmt.Errorf("invalid minor %d", rule.Major)
}
hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1
hasMinor := rule.Minor >= 0
bpfAccess := int32(0)
for _, r := range rule.Permissions {
switch r {
case 'r':
bpfAccess |= unix.BPF_DEVCG_ACC_READ
case 'w':
bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
case 'm':
bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
default:
return fmt.Errorf("unknown device access %v", r)
}
}
// If the access is rwm, skip the check.
hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
var (
blockSym = "block-" + strconv.Itoa(p.blockID)
nextBlockSym = "block-" + strconv.Itoa(p.blockID+1)
prevBlockLastIdx = len(p.insts) - 1
)
p.insts = append(p.insts,
// if (R2 != bpfType) goto next
asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
)
if hasAccess {
p.insts = append(p.insts,
// if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next
asm.Mov.Reg32(asm.R1, asm.R3),
asm.And.Imm32(asm.R1, bpfAccess),
asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym),
)
}
if hasMajor {
p.insts = append(p.insts,
// if (R4 != major) goto next
asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym),
)
}
if hasMinor {
p.insts = append(p.insts,
// if (R5 != minor) goto next
asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym),
)
}
p.insts = append(p.insts, acceptBlock(rule.Allow)...)
// set blockSym to the first instruction we added in this iteration
p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].WithSymbol(blockSym)
p.blockID++
return nil
}
func (p *program) finalize() asm.Instructions {
var v int32
if p.defaultAllow {
v = 1
}
blockSym := "block-" + strconv.Itoa(p.blockID)
p.insts = append(p.insts,
// R0 <- v
asm.Mov.Imm32(asm.R0, v).WithSymbol(blockSym),
asm.Return(),
)
p.blockID = -1
return p.insts
}
func acceptBlock(accept bool) asm.Instructions {
var v int32
if accept {
v = 1
}
return []asm.Instruction{
// R0 <- v
asm.Mov.Imm32(asm.R0, v),
asm.Return(),
}
}
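// deviceFilterExample is an illustrative sketch (not part of the original
// file): building the eBPF instructions for a minimal allow-list that only
// permits read/write/mknod on char device 1:3 (the usual numbers for
// /dev/null, used purely as an example).
func deviceFilterExample() (asm.Instructions, string, error) {
	rules := []*devices.Rule{
		{Type: devices.CharDevice, Major: 1, Minor: 3, Permissions: "rwm", Allow: true},
	}
	return deviceFilter(rules)
}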
// Package devices contains functionality to manage cgroup devices, which
// is exposed indirectly via libcontainer/cgroups managers.
//
// To enable cgroup managers to manage devices, this package must be imported.
package devices
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
)
func init() {
cgroups.DevicesSetV1 = setV1
cgroups.DevicesSetV2 = setV2
systemd.GenerateDeviceProps = systemdProperties
}
// SPDX-License-Identifier: Apache-2.0
/*
* Copyright (C) 2020 Aleksa Sarai <cyphar@cyphar.com>
* Copyright (C) 2020 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package devices
import (
"bufio"
"fmt"
"io"
"sort"
"strconv"
"strings"
devices "github.com/opencontainers/runc/libcontainer/cgroups/devices/config"
)
// deviceMeta is a Rule without the Allow or Permissions fields, and no
// wildcard-type support. It's effectively the "match" portion of a metadata
// rule, for the purposes of our emulation.
type deviceMeta struct {
node devices.Type
major int64
minor int64
}
// deviceRule is effectively the tuple (deviceMeta, Permissions).
type deviceRule struct {
meta deviceMeta
perms devices.Permissions
}
// deviceRules is a mapping of device metadata rules to the associated
// permissions in the ruleset.
type deviceRules map[deviceMeta]devices.Permissions
func (r deviceRules) orderedEntries() []deviceRule {
var rules []deviceRule
for meta, perms := range r {
rules = append(rules, deviceRule{meta: meta, perms: perms})
}
sort.Slice(rules, func(i, j int) bool {
// Sort by (major, minor, type).
a, b := rules[i].meta, rules[j].meta
return a.major < b.major ||
(a.major == b.major && a.minor < b.minor) ||
(a.major == b.major && a.minor == b.minor && a.node < b.node)
})
return rules
}
type emulator struct {
defaultAllow bool
rules deviceRules
}
func (e *emulator) IsBlacklist() bool {
return e.defaultAllow
}
func (e *emulator) IsAllowAll() bool {
return e.IsBlacklist() && len(e.rules) == 0
}
func parseLine(line string) (*deviceRule, error) {
// Input: node major:minor perms.
fields := strings.FieldsFunc(line, func(r rune) bool {
return r == ' ' || r == ':'
})
if len(fields) != 4 {
return nil, fmt.Errorf("malformed devices.list rule %s", line)
}
var (
rule deviceRule
node = fields[0]
major = fields[1]
minor = fields[2]
perms = fields[3]
)
// Parse the node type.
switch node {
case "a":
// Super-special case -- "a" always means every device with every
// access mode. In fact, for devices.list this actually indicates that
// the cgroup is in black-list mode.
// TODO: Double-check that the entire file is "a *:* rwm".
return nil, nil
case "b":
rule.meta.node = devices.BlockDevice
case "c":
rule.meta.node = devices.CharDevice
default:
return nil, fmt.Errorf("unknown device type %q", node)
}
// Parse the major number.
if major == "*" {
rule.meta.major = devices.Wildcard
} else {
val, err := strconv.ParseUint(major, 10, 32)
if err != nil {
return nil, fmt.Errorf("invalid major number: %w", err)
}
rule.meta.major = int64(val)
}
// Parse the minor number.
if minor == "*" {
rule.meta.minor = devices.Wildcard
} else {
val, err := strconv.ParseUint(minor, 10, 32)
if err != nil {
return nil, fmt.Errorf("invalid minor number: %w", err)
}
rule.meta.minor = int64(val)
}
// Parse the access permissions.
rule.perms = devices.Permissions(perms)
if !rule.perms.IsValid() || rule.perms.IsEmpty() {
return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms)
}
return &rule, nil
}
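// parseLineExample is an illustrative sketch (not part of the original
// file): a typical devices.list entry such as "c 1:3 rwm" parses into a
// char-device rule for major 1, minor 3 with all three access modes, while
// "a *:* rwm" yields a nil rule (the black-list mode marker).
func parseLineExample() (*deviceRule, error) {
	return parseLine("c 1:3 rwm")
}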
func (e *emulator) addRule(rule deviceRule) error { //nolint:unparam
if e.rules == nil {
e.rules = make(map[deviceMeta]devices.Permissions)
}
// Merge with any pre-existing permissions.
oldPerms := e.rules[rule.meta]
newPerms := rule.perms.Union(oldPerms)
e.rules[rule.meta] = newPerms
return nil
}
func (e *emulator) rmRule(rule deviceRule) error {
// Give an error if any of the permissions requested to be removed are
// present in a partially-matching wildcard rule, because such rules will
// be ignored by cgroupv1.
//
// This is a diversion from cgroupv1, but is necessary to avoid leading
// users into a false sense of security. cgroupv1 will silently(!) ignore
// requests to remove partial exceptions, but we really shouldn't do that.
//
// It may seem like we could just "split" wildcard rules which hit this
// issue, but unfortunately there are 2^32 possible major and minor
// numbers, which would exhaust kernel memory quickly if we did this. Not
// to mention it'd be really slow (the kernel side is implemented as a
// linked-list of exceptions).
for _, partialMeta := range []deviceMeta{
{node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor},
{node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard},
{node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard},
} {
// This wildcard rule is equivalent to the requested rule, so skip it.
if rule.meta == partialMeta {
continue
}
// Only give an error if the set of permissions overlap.
partialPerms := e.rules[partialMeta]
if !partialPerms.Intersection(rule.perms).IsEmpty() {
return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms)
}
}
// Subtract all of the permissions listed from the full match rule. If the
// rule didn't exist, all of this is a no-op.
newPerms := e.rules[rule.meta].Difference(rule.perms)
if newPerms.IsEmpty() {
delete(e.rules, rule.meta)
} else {
e.rules[rule.meta] = newPerms
}
// TODO: The actual cgroup code doesn't care if an exception didn't exist
// during removal, so not erroring out here is /accurate/ but quite
// worrying. Maybe we should do additional validation, but again we
// have to worry about backwards-compatibility.
return nil
}
func (e *emulator) allow(rule *deviceRule) error {
// This cgroup is configured as a black-list. Reset the entire emulator,
// and put it into black-list mode.
if rule == nil || rule.meta.node == devices.WildcardDevice {
*e = emulator{
defaultAllow: true,
rules: nil,
}
return nil
}
var err error
if e.defaultAllow {
err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception")
} else {
err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception")
}
return err
}
func (e *emulator) deny(rule *deviceRule) error {
// This cgroup is configured as a white-list. Reset the entire emulator,
// and put it into white-list mode.
if rule == nil || rule.meta.node == devices.WildcardDevice {
*e = emulator{
defaultAllow: false,
rules: nil,
}
return nil
}
var err error
if e.defaultAllow {
err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception")
} else {
err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception")
}
return err
}
func (e *emulator) Apply(rule devices.Rule) error {
if !rule.Type.CanCgroup() {
return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
}
innerRule := &deviceRule{
meta: deviceMeta{
node: rule.Type,
major: rule.Major,
minor: rule.Minor,
},
perms: rule.Permissions,
}
if innerRule.meta.node == devices.WildcardDevice {
innerRule = nil
}
if rule.Allow {
return e.allow(innerRule)
}
return e.deny(innerRule)
}
// emulatorFromList takes a reader to a "devices.list"-like source, and returns
// a new Emulator that represents the state of the devices cgroup. Note that
// black-list devices cgroups cannot be fully reconstructed, due to limitations
// in the devices cgroup API. Instead, such cgroups are always treated as
// "allow all" cgroups.
func emulatorFromList(list io.Reader) (*emulator, error) {
// Normally cgroups are in black-list mode by default, but the way we
// figure out the current mode is whether or not devices.list has an
// allow-all rule. So we default to a white-list, and the existence of an
// "a *:* rwm" entry will tell us otherwise.
e := &emulator{
defaultAllow: false,
}
// Parse the "devices.list".
s := bufio.NewScanner(list)
for s.Scan() {
line := s.Text()
deviceRule, err := parseLine(line)
if err != nil {
return nil, fmt.Errorf("error parsing line %q: %w", line, err)
}
// "devices.list" is an allow list. Note that this means that in
// black-list mode, we have no idea what rules are in play. As a
// result, we need to be very careful in Transition().
if err := e.allow(deviceRule); err != nil {
return nil, fmt.Errorf("error adding devices.list rule: %w", err)
}
}
if err := s.Err(); err != nil {
return nil, fmt.Errorf("error reading devices.list lines: %w", err)
}
return e, nil
}
// Transition calculates the minimally-disruptive set of rules that need to
// be applied to a devices cgroup in order to transition to the given target.
// This means that any already-existing rules will not be applied, and
// disruptive rules (like denying all device access) will only be applied if
// necessary.
//
// This function is the sole reason for all of Emulator -- to allow us
// to figure out how to update a container's cgroups without causing spurious
// device errors (if possible).
func (source *emulator) Transition(target *emulator) ([]*devices.Rule, error) { //nolint:revive // Ignore receiver-naming warning.
var transitionRules []*devices.Rule
oldRules := source.rules
// If the default policy doesn't match, we need to include a "disruptive"
// rule (either allow-all or deny-all) in order to switch the cgroup to the
// correct default policy.
//
// However, due to a limitation in "devices.list" we cannot be sure what
// deny rules are in place in a black-list cgroup. Thus if the source is a
// black-list we also have to include a disruptive rule.
if source.IsBlacklist() || source.defaultAllow != target.defaultAllow {
transitionRules = append(transitionRules, &devices.Rule{
Type: 'a',
Major: -1,
Minor: -1,
Permissions: devices.Permissions("rwm"),
Allow: target.defaultAllow,
})
// The old rules are only relevant if we aren't starting out with a
// disruptive rule.
oldRules = nil
}
// NOTE: We traverse through the rules in a sorted order so we always write
// the same set of rules (this is to aid testing).
// First, we create inverse rules for any old rules not in the new set.
// This includes partial-inverse rules for specific permissions. This is a
// no-op if we added a disruptive rule, since oldRules will be empty.
for _, rule := range oldRules.orderedEntries() {
meta, oldPerms := rule.meta, rule.perms
newPerms := target.rules[meta]
droppedPerms := oldPerms.Difference(newPerms)
if !droppedPerms.IsEmpty() {
transitionRules = append(transitionRules, &devices.Rule{
Type: meta.node,
Major: meta.major,
Minor: meta.minor,
Permissions: droppedPerms,
Allow: target.defaultAllow,
})
}
}
// Add any additional rules which weren't in the old set. We happen to
// filter out rules which are present in both sets, though this isn't
// strictly necessary.
for _, rule := range target.rules.orderedEntries() {
meta, newPerms := rule.meta, rule.perms
oldPerms := oldRules[meta]
gainedPerms := newPerms.Difference(oldPerms)
if !gainedPerms.IsEmpty() {
transitionRules = append(transitionRules, &devices.Rule{
Type: meta.node,
Major: meta.major,
Minor: meta.minor,
Permissions: gainedPerms,
Allow: !target.defaultAllow,
})
}
}
return transitionRules, nil
}
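// transitionExample is an illustrative sketch (not part of the original
// file): computing the rules needed to go from an allow-all cgroup to a
// white-list that only allows "c 1:3 rwm". Because the source is a
// black-list, the result starts with a disruptive deny-all "a" rule,
// followed by the single allow rule.
func transitionExample() ([]*devices.Rule, error) {
	source, err := emulatorFromList(strings.NewReader("a *:* rwm"))
	if err != nil {
		return nil, err
	}
	target := &emulator{}
	if err := target.Apply(devices.Rule{
		Type: devices.CharDevice, Major: 1, Minor: 3, Permissions: "rwm", Allow: true,
	}); err != nil {
		return nil, err
	}
	return source.Transition(target)
}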
// Rules returns the minimum set of rules necessary to convert a *deny-all*
// cgroup to the emulated filter state (note that this is not the same as a
// default cgroupv1 cgroup -- which is allow-all). This is effectively just a
// wrapper around Transition() with the source emulator being an empty cgroup.
func (e *emulator) Rules() ([]*devices.Rule, error) {
defaultCgroup := &emulator{defaultAllow: false}
return defaultCgroup.Transition(e)
}
func wrapErr(err error, text string) error {
if err == nil {
return nil
}
return fmt.Errorf(text+": %w", err)
}
// Copyright 2021 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package devices
import (
gofuzzheaders "github.com/AdaLogics/go-fuzz-headers"
"strings"
)
func Fuzz(data []byte) int {
c := gofuzzheaders.NewConsumer(data)
str1, err := c.GetString()
if err != nil {
return -1
}
reader1 := strings.NewReader(str1)
emu1, err := emulatorFromList(reader1)
if err != nil {
return -1
}
str2, err := c.GetString()
if err != nil {
return -1
}
reader2 := strings.NewReader(str2)
emu2, err := emulatorFromList(reader2)
if err != nil {
return -1
}
emu1.Transition(emu2)
return 1
}
package devices
import (
"errors"
"fmt"
"os"
"runtime"
"sync"
"unsafe"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/link"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func nilCloser() error {
return nil
}
func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) {
type bpfAttrQuery struct {
TargetFd uint32
AttachType uint32
QueryType uint32
AttachFlags uint32
ProgIds uint64 // __aligned_u64
ProgCnt uint32
}
// Currently you can only have 64 eBPF programs attached to a cgroup.
size := 64
retries := 0
for retries < 10 {
progIds := make([]uint32, size)
query := bpfAttrQuery{
TargetFd: uint32(dirFd),
AttachType: uint32(unix.BPF_CGROUP_DEVICE),
ProgIds: uint64(uintptr(unsafe.Pointer(&progIds[0]))),
ProgCnt: uint32(len(progIds)),
}
// Fetch the list of program ids.
_, _, errno := unix.Syscall(unix.SYS_BPF,
uintptr(unix.BPF_PROG_QUERY),
uintptr(unsafe.Pointer(&query)),
unsafe.Sizeof(query))
size = int(query.ProgCnt)
runtime.KeepAlive(query)
if errno != 0 {
// On ENOSPC we get the correct number of programs.
if errno == unix.ENOSPC {
retries++
continue
}
return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
}
// Convert the ids to program handles.
progIds = progIds[:size]
programs := make([]*ebpf.Program, 0, len(progIds))
for _, progId := range progIds {
program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
if err != nil {
// We skip over programs that give us -EACCES or -EPERM. This
// is necessary because there may be BPF programs that have
// been attached (such as with --systemd-cgroup) which have an
// LSM label that blocks us from interacting with the program.
//
// Because additional BPF_CGROUP_DEVICE programs can only add
// restrictions, there's no real issue with just ignoring these
// programs (and this stops runc from breaking on distributions
// with very strict SELinux policies).
if errors.Is(err, os.ErrPermission) {
logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err)
continue
}
return nil, fmt.Errorf("cannot fetch program from id: %w", err)
}
programs = append(programs, program)
}
runtime.KeepAlive(progIds)
return programs, nil
}
return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
}
var (
haveBpfProgReplaceBool bool
haveBpfProgReplaceOnce sync.Once
)
// Loosely based on the BPF_F_REPLACE support check in
// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go.
//
// TODO: move this logic to cilium/ebpf
func haveBpfProgReplace() bool {
haveBpfProgReplaceOnce.Do(func() {
prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
License: "MIT",
Instructions: asm.Instructions{
asm.Mov.Imm(asm.R0, 0),
asm.Return(),
},
})
if err != nil {
logrus.Warnf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err)
return
}
defer prog.Close()
devnull, err := os.Open("/dev/null")
if err != nil {
logrus.Warnf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err)
return
}
defer devnull.Close()
// We know that we have BPF_PROG_ATTACH since we can load
// BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
// we know that the feature isn't present.
err = link.RawAttachProgram(link.RawAttachProgramOptions{
// We rely on this fd being checked after attachFlags in the kernel.
Target: int(devnull.Fd()),
// Attempt to "replace" our BPF program with itself. This will
// always fail, but we should get -EINVAL if BPF_F_REPLACE is not
// supported.
Anchor: link.ReplaceProgram(prog),
Program: prog,
Attach: ebpf.AttachCGroupDevice,
Flags: unix.BPF_F_ALLOW_MULTI,
})
if errors.Is(err, ebpf.ErrNotSupported) || errors.Is(err, unix.EINVAL) {
// not supported
return
}
if !errors.Is(err, unix.EBADF) {
// If we see any new errors here, it's possible that there is a
// regression due to a cilium/ebpf update and the above EINVAL
// checks are not working. So, be loud about it so someone notices
// and we can get the issue fixed quicker.
logrus.Warnf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
}
haveBpfProgReplaceBool = true
})
return haveBpfProgReplaceBool
}
// loadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
//
// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
//
// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
func loadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) {
// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
// This limit is not inherited into the container.
memlockLimit := &unix.Rlimit{
Cur: unix.RLIM_INFINITY,
Max: unix.RLIM_INFINITY,
}
_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
// Get the list of existing programs.
oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
if err != nil {
return nilCloser, err
}
useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1
// Generate new program.
spec := &ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
Instructions: insts,
License: license,
}
prog, err := ebpf.NewProgram(spec)
if err != nil {
return nilCloser, err
}
// If there is only one old program, we can just replace it directly.
attachProgramOptions := link.RawAttachProgramOptions{
Target: dirFd,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
Flags: unix.BPF_F_ALLOW_MULTI,
}
if useReplaceProg {
attachProgramOptions.Anchor = link.ReplaceProgram(oldProgs[0])
}
err = link.RawAttachProgram(attachProgramOptions)
if err != nil {
return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
}
closer := func() error {
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err)
}
// TODO: Should we attach the old filters back in this case? Otherwise
// we fail-open on a security feature, which is a bit scary.
return nil
}
if !useReplaceProg {
logLevel := logrus.DebugLevel
// If there was more than one old program, give a warning (since this
// really shouldn't happen with runc-managed cgroups) and then detach
// all the old programs.
if len(oldProgs) > 1 {
// NOTE: Ideally this should be a warning but it turns out that
// systemd-managed cgroups trigger this warning (apparently
// systemd doesn't delete old non-systemd programs when
// setting properties).
logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
logLevel = logrus.InfoLevel
}
for idx, oldProg := range oldProgs {
// Output some extra debug info.
if info, err := oldProg.Info(); err == nil {
fields := logrus.Fields{
"type": info.Type.String(),
"tag": info.Tag,
"name": info.Name,
}
if id, ok := info.ID(); ok {
fields["id"] = id
}
if runCount, ok := info.RunCount(); ok {
fields["run_count"] = runCount
}
if runtime, ok := info.Runtime(); ok {
fields["runtime"] = runtime.String()
}
logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx)
}
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: oldProg,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
}
}
}
return closer, nil
}
package devices
import (
"bufio"
"fmt"
"os"
"strconv"
"strings"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/godbus/dbus/v5"
"github.com/sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
devices "github.com/opencontainers/runc/libcontainer/cgroups/devices/config"
)
// systemdProperties takes the configured device rules and generates a
// corresponding set of systemd properties to configure the devices correctly.
func systemdProperties(r *cgroups.Resources, sdVer int) ([]systemdDbus.Property, error) {
if r.SkipDevices {
return nil, nil
}
properties := []systemdDbus.Property{
// When we later add DeviceAllow=/dev/foo properties, we are
// appending devices to the allow list for the unit. However,
// if this is an existing unit, it already has DeviceAllow=
// entries, and we need to clear them all before applying the
// new set. (We also do this for new units, mainly for safety
// to ensure we only enable the devices we expect.)
//
// To clear any existing DeviceAllow= rules, we have to add an
// empty DeviceAllow= property.
newProp("DeviceAllow", []deviceAllowEntry{}),
// Always run in the strictest white-list mode.
newProp("DevicePolicy", "strict"),
}
// Figure out the set of rules.
configEmu := emulator{}
for _, rule := range r.Devices {
if err := configEmu.Apply(*rule); err != nil {
return nil, fmt.Errorf("unable to apply rule for systemd: %w", err)
}
}
// systemd doesn't support blacklists. So we log a warning, and tell
// systemd to act as a deny-all whitelist. This ruleset will be replaced
// with our normal fallback code. This may result in spurious errors, but
// the only other option is to error out here.
if configEmu.IsBlacklist() {
// However, if we're dealing with an allow-all rule then we can do it.
if configEmu.IsAllowAll() {
return allowAllDevices(), nil
}
logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule")
return properties, nil
}
// Now generate the set of rules we actually need to apply. Unlike the
// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
// whitelist which is the default for devices.Emulator.
finalRules, err := configEmu.Rules()
if err != nil {
return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err)
}
var deviceAllowList []deviceAllowEntry
for _, rule := range finalRules {
if !rule.Allow {
// Should never happen.
return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
}
switch rule.Type {
case devices.BlockDevice, devices.CharDevice:
default:
// Should never happen.
return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
}
entry := deviceAllowEntry{
Perms: string(rule.Permissions),
}
// systemd has a fairly odd (though understandable) syntax here, and
// because of the OCI configuration format we have to do quite a bit of
// trickery to convert things:
//
// * Concrete rules with non-wildcard major/minor numbers have to use
// /dev/{block,char}/MAJOR:minor paths. Before v240, systemd uses
// stat(2) on such paths to look up device properties, meaning we
// cannot add whitelist rules for devices that don't exist. Since v240,
// device properties are parsed from the path string.
//
// However, path globbing is not supported for path-based rules so we
// need to handle wildcards in some other manner.
//
// * If systemd older than v240 is used, wildcard-minor rules
// have to specify a "device group name" (the second column
// in /proc/devices).
//
// * Wildcard (major and minor) rules can just specify a glob with the
// type ("char-*" or "block-*").
//
// The only type of rule we can't handle is wildcard-major rules, and
// so we'll give a warning in that case (note that the fallback code
// will insert any rules systemd couldn't handle). What amazing fun.
if rule.Major == devices.Wildcard {
// "_ *:n _" rules aren't supported by systemd.
if rule.Minor != devices.Wildcard {
logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule)
continue
}
// "_ *:* _" rules just wildcard everything.
prefix, err := groupPrefix(rule.Type)
if err != nil {
return nil, err
}
entry.Path = prefix + "*"
} else if rule.Minor == devices.Wildcard {
if sdVer >= 240 {
// systemd v240+ allows for {block,char}-MAJOR syntax.
prefix, err := groupPrefix(rule.Type)
if err != nil {
return nil, err
}
entry.Path = prefix + strconv.FormatInt(rule.Major, 10)
} else {
// For older systemd, "_ n:* _" rules require a device group from /proc/devices.
group, err := findDeviceGroup(rule.Type, rule.Major)
if err != nil {
return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err)
}
if group == "" {
// Couldn't find a group.
logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule)
continue
}
entry.Path = group
}
} else {
// "_ n:m _" rules are just a path in /dev/{block,char}/.
switch rule.Type {
case devices.BlockDevice:
entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor)
case devices.CharDevice:
entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
}
if sdVer < 240 {
// Old systemd versions use stat(2) on path to find out device major:minor
// numbers and type. If the path doesn't exist, it will not add the rule,
// emitting a warning instead.
// Since all of this logic is best-effort anyway (we manually set these
// rules separately to systemd) we can safely skip entries that don't
// have a corresponding path.
if _, err := os.Stat(entry.Path); err != nil {
continue
}
}
}
deviceAllowList = append(deviceAllowList, entry)
}
properties = append(properties, newProp("DeviceAllow", deviceAllowList))
return properties, nil
}
func newProp(name string, units interface{}) systemdDbus.Property {
return systemdDbus.Property{
Name: name,
Value: dbus.MakeVariant(units),
}
}
func groupPrefix(ruleType devices.Type) (string, error) {
switch ruleType {
case devices.BlockDevice:
return "block-", nil
case devices.CharDevice:
return "char-", nil
default:
return "", fmt.Errorf("device type %v has no group prefix", ruleType)
}
}
// findDeviceGroup tries to find the device group name (as listed in
// /proc/devices) with the type prefixed as required for DeviceAllow, for a
// given (type, major) combination. If more than one device group exists, an
// arbitrary one is chosen.
func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) {
fh, err := os.Open("/proc/devices")
if err != nil {
return "", err
}
defer fh.Close()
prefix, err := groupPrefix(ruleType)
if err != nil {
return "", err
}
ruleMajorStr := strconv.FormatInt(ruleMajor, 10) + " "
scanner := bufio.NewScanner(fh)
var currentType devices.Type
for scanner.Scan() {
// We need to strip spaces because the first number is column-aligned.
line := strings.TrimSpace(scanner.Text())
// Handle the "header" lines.
switch line {
case "Block devices:":
currentType = devices.BlockDevice
continue
case "Character devices:":
currentType = devices.CharDevice
continue
case "":
continue
}
// Skip lines unrelated to our type.
if currentType != ruleType {
continue
}
if group, ok := strings.CutPrefix(line, ruleMajorStr); ok {
return prefix + group, nil
}
}
if err := scanner.Err(); err != nil {
return "", fmt.Errorf("reading /proc/devices: %w", err)
}
// Couldn't find the device group.
return "", nil
}
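// findDeviceGroupExample is an illustrative sketch (not part of the
// original file). On a typical Linux host, /proc/devices lists major 10
// under "Character devices:" as "misc", so this lookup would return
// "char-misc"; the exact result depends on the running kernel.
func findDeviceGroupExample() (string, error) {
	return findDeviceGroup(devices.CharDevice, 10)
}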
// DeviceAllow is the dbus type "a(ss)" which means we need a struct
// to represent it in Go.
type deviceAllowEntry struct {
Path string
Perms string
}
func allowAllDevices() []systemdDbus.Property {
// Setting mode to auto and removing all DeviceAllow rules
// results in allowing access to all devices.
return []systemdDbus.Property{
newProp("DeviceAllow", []deviceAllowEntry{}),
newProp("DevicePolicy", "auto"),
}
}
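// exampleDeviceAllow is an illustrative sketch (not part of the original
// file) of the kind of DeviceAllow entries systemdProperties generates: a
// concrete char device 1:3 becomes a /dev/char path, while a full wildcard
// becomes the "char-*" group glob.
var exampleDeviceAllow = []deviceAllowEntry{
	{Path: "/dev/char/1:3", Perms: "rwm"},
	{Path: "char-*", Perms: "rwm"},
}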
package devices
import (
"bytes"
"errors"
"reflect"
"github.com/moby/sys/userns"
"github.com/opencontainers/runc/libcontainer/cgroups"
devices "github.com/opencontainers/runc/libcontainer/cgroups/devices/config"
)
var testingSkipFinalCheck bool
func setV1(path string, r *cgroups.Resources) error {
if userns.RunningInUserNS() || r.SkipDevices {
return nil
}
// Generate two emulators, one for the current state of the cgroup and one
// for the requested state by the user.
current, err := loadEmulator(path)
if err != nil {
return err
}
target, err := buildEmulator(r.Devices)
if err != nil {
return err
}
// Compute the minimal set of transition rules needed to achieve the
// requested state.
transitionRules, err := current.Transition(target)
if err != nil {
return err
}
for _, rule := range transitionRules {
file := "devices.deny"
if rule.Allow {
file = "devices.allow"
}
if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil {
return err
}
}
// Final safety check -- ensure that the resulting state is what was
// requested. This is only really correct for white-lists, but for
// black-lists we can at least check that the cgroup is in the right mode.
//
// This safety-check is skipped for the unit tests because we cannot
// currently mock devices.list correctly.
if !testingSkipFinalCheck {
currentAfter, err := loadEmulator(path)
if err != nil {
return err
}
if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) {
return errors.New("resulting devices cgroup doesn't precisely match target")
} else if target.IsBlacklist() != currentAfter.IsBlacklist() {
return errors.New("resulting devices cgroup doesn't match target mode")
}
}
return nil
}
func loadEmulator(path string) (*emulator, error) {
list, err := cgroups.ReadFile(path, "devices.list")
if err != nil {
return nil, err
}
return emulatorFromList(bytes.NewBufferString(list))
}
func buildEmulator(rules []*devices.Rule) (*emulator, error) {
// This defaults to a white-list -- which is what we want!
emu := &emulator{}
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, err
}
}
return emu, nil
}
package devices
import (
"fmt"
"github.com/moby/sys/userns"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
devices "github.com/opencontainers/runc/libcontainer/cgroups/devices/config"
)
func isRWM(perms devices.Permissions) bool {
var r, w, m bool
for _, perm := range perms {
switch perm {
case 'r':
r = true
case 'w':
w = true
case 'm':
m = true
}
}
return r && w && m
}
// This is similar to the logic applied in crun for handling errors from bpf(2)
// <https://github.com/containers/crun/blob/0.17/src/libcrun/cgroup.c#L2438-L2470>.
func canSkipEBPFError(r *cgroups.Resources) bool {
// If we're running in a user namespace we can ignore eBPF rules because we
// usually cannot use bpf(2); in addition, rootless containers usually don't
// have the necessary privileges to mknod(2) device inodes or access
// host-level instances (though ideally we would be blocking device access
// for rootless containers anyway).
if userns.RunningInUserNS() {
return true
}
// We cannot ignore an eBPF load error if any rule is a block rule or it
// doesn't permit all access modes.
//
// NOTE: This will sometimes trigger in cases where access modes are split
// between different rules but to handle this correctly would require
// using ".../libcontainer/cgroup/devices".Emulator.
for _, dev := range r.Devices {
if !dev.Allow || !isRWM(dev.Permissions) {
return false
}
}
return true
}
func setV2(dirPath string, r *cgroups.Resources) error {
if r.SkipDevices {
return nil
}
insts, license, err := deviceFilter(r.Devices)
if err != nil {
return err
}
dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0o600)
if err != nil {
return fmt.Errorf("cannot get dir FD for %s", dirPath)
}
defer unix.Close(dirFD)
if _, err := loadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
if !canSkipEBPFError(r) {
return err
}
}
return nil
}
package cgroups
import (
"bytes"
"errors"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// OpenFile opens a cgroup file in a given dir with given flags.
// It is supposed to be used for cgroup files only, and returns
// an error if the file is not a cgroup file.
//
// Arguments dir and file are joined together to form an absolute path
// to a file being opened.
func OpenFile(dir, file string, flags int) (*os.File, error) {
if dir == "" {
return nil, fmt.Errorf("no directory specified for %s", file)
}
return openFile(dir, file, flags)
}
// ReadFile reads data from a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func ReadFile(dir, file string) (string, error) {
fd, err := OpenFile(dir, file, unix.O_RDONLY)
if err != nil {
return "", err
}
defer fd.Close()
var buf bytes.Buffer
_, err = buf.ReadFrom(fd)
return buf.String(), err
}
// WriteFile writes data to a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func WriteFile(dir, file, data string) error {
fd, err := OpenFile(dir, file, unix.O_WRONLY)
if err != nil {
return err
}
defer fd.Close()
if _, err := fd.WriteString(data); err != nil {
// Having data in the error message helps in debugging.
return fmt.Errorf("failed to write %q: %w", data, err)
}
return nil
}
// WriteFileByLine is the same as WriteFile, except if data contains newlines,
// it is written line by line.
func WriteFileByLine(dir, file, data string) error {
i := strings.Index(data, "\n")
if i == -1 {
return WriteFile(dir, file, data)
}
fd, err := OpenFile(dir, file, unix.O_WRONLY)
if err != nil {
return err
}
defer fd.Close()
start := 0
for {
var line string
if i == -1 {
line = data[start:]
} else {
line = data[start : start+i+1]
}
_, err := fd.WriteString(line)
if err != nil {
return fmt.Errorf("failed to write %q: %w", line, err)
}
if i == -1 {
break
}
start += i + 1
i = strings.Index(data[start:], "\n")
}
return nil
}
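// fileAPIExample is an illustrative sketch (not part of the original file)
// of how these helpers are used together; the cgroup directory and values
// below are placeholders. WriteFileByLine splits the data on newlines and
// performs one write per line.
func fileAPIExample() error {
	const dir = "/sys/fs/cgroup/mycontainer" // hypothetical cgroup directory
	if err := WriteFileByLine(dir, "cgroup.subtree_control", "+cpu\n+memory"); err != nil {
		return err
	}
	_, err := ReadFile(dir, "cgroup.controllers")
	return err
}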
const (
cgroupfsDir = "/sys/fs/cgroup"
cgroupfsPrefix = cgroupfsDir + "/"
)
var (
// TestMode is set to true by unit tests that need "fake" cgroupfs.
TestMode bool
cgroupRootHandle *os.File
prepOnce sync.Once
prepErr error
resolveFlags uint64
)
func prepareOpenat2() error {
prepOnce.Do(func() {
fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC,
})
if err != nil {
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
if err != unix.ENOSYS {
logrus.Warnf("falling back to securejoin: %s", prepErr)
} else {
logrus.Debug("openat2 not available, falling back to securejoin")
}
return
}
file := os.NewFile(uintptr(fd), cgroupfsDir)
var st unix.Statfs_t
if err := unix.Fstatfs(int(file.Fd()), &st); err != nil {
prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
logrus.Warnf("falling back to securejoin: %s", prepErr)
return
}
cgroupRootHandle = file
resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
if st.Type == unix.CGROUP2_SUPER_MAGIC {
// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
}
})
return prepErr
}
func openFile(dir, file string, flags int) (*os.File, error) {
mode := os.FileMode(0)
if TestMode && flags&os.O_WRONLY != 0 {
// "emulate" cgroup fs for unit tests
flags |= os.O_TRUNC | os.O_CREATE
mode = 0o600
}
// NOTE it is important to use filepath.Clean("/"+file) here
// (see https://github.com/opencontainers/runc/issues/4103)!
path := filepath.Join(dir, filepath.Clean("/"+file))
if prepareOpenat2() != nil {
return openFallback(path, flags, mode)
}
relPath, ok := strings.CutPrefix(path, cgroupfsPrefix)
if !ok { // Non-standard path, old system?
return openFallback(path, flags, mode)
}
fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath,
&unix.OpenHow{
Resolve: resolveFlags,
Flags: uint64(flags) | unix.O_CLOEXEC,
Mode: uint64(mode),
})
if err != nil {
err = &os.PathError{Op: "openat2", Path: path, Err: err}
// Check if cgroupRootHandle is still opened to cgroupfsDir
// (happens when this package is incorrectly used
// across the chroot/pivot_root/mntns boundary, or
// when /sys/fs/cgroup is remounted).
//
// TODO: if such usage will ever be common, amend this
// to reopen cgroupRootHandle and retry openat2.
fdDest, fdErr := os.Readlink("/proc/thread-self/fd/" + strconv.Itoa(int(cgroupRootHandle.Fd())))
if fdErr == nil && fdDest != cgroupfsDir {
// Wrap the error so it is clear that cgroupRootHandle
// is opened to an unexpected/wrong directory.
err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w",
cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err)
}
return nil, err
}
return os.NewFile(uintptr(fd), path), nil
}
var errNotCgroupfs = errors.New("not a cgroup file")
// Can be changed by unit tests.
var openFallback = openAndCheck
// openAndCheck is used when openat2(2) is not available. It checks that the
// opened file is on cgroupfs, returning an error otherwise.
func openAndCheck(path string, flags int, mode os.FileMode) (*os.File, error) {
fd, err := os.OpenFile(path, flags, mode)
if err != nil {
return nil, err
}
if TestMode {
return fd, nil
}
// Check this is a cgroupfs file.
var st unix.Statfs_t
if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil {
_ = fd.Close()
return nil, &os.PathError{Op: "statfs", Path: path, Err: err}
}
if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC {
_ = fd.Close()
return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs}
}
return fd, nil
}
package fs
import (
"bufio"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
type BlkioGroup struct {
weightFilename string
weightDeviceFilename string
}
func (s *BlkioGroup) Name() string {
return "blkio"
}
func (s *BlkioGroup) Apply(path string, _ *cgroups.Resources, pid int) error {
return apply(path, pid)
}
func (s *BlkioGroup) Set(path string, r *cgroups.Resources) error {
s.detectWeightFilenames(path)
if r.BlkioWeight != 0 {
if err := cgroups.WriteFile(path, s.weightFilename, strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
return err
}
}
if r.BlkioLeafWeight != 0 {
if err := cgroups.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil {
return err
}
}
for _, wd := range r.BlkioWeightDevice {
if wd.Weight != 0 {
if err := cgroups.WriteFile(path, s.weightDeviceFilename, wd.WeightString()); err != nil {
return err
}
}
if wd.LeafWeight != 0 {
if err := cgroups.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
return err
}
}
}
for _, td := range r.BlkioThrottleReadBpsDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteBpsDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleReadIOPSDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteIOPSDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
return err
}
}
return nil
}
/*
examples:
blkio.sectors
8:0 6792
blkio.io_service_bytes
8:0 Read 1282048
8:0 Write 2195456
8:0 Sync 2195456
8:0 Async 1282048
8:0 Total 3477504
Total 3477504
blkio.io_serviced
8:0 Read 124
8:0 Write 104
8:0 Sync 104
8:0 Async 124
8:0 Total 228
Total 228
blkio.io_queued
8:0 Read 0
8:0 Write 0
8:0 Sync 0
8:0 Async 0
8:0 Total 0
Total 0
*/
func splitBlkioStatLine(r rune) bool {
return r == ' ' || r == ':'
}
func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) {
var blkioStats []cgroups.BlkioStatEntry
f, err := cgroups.OpenFile(dir, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return blkioStats, nil
}
return nil, err
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
// format: dev type amount
fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine)
if len(fields) < 3 {
if len(fields) == 2 && fields[0] == "Total" {
// skip total line
continue
} else {
return nil, malformedLine(dir, file, sc.Text())
}
}
v, err := strconv.ParseUint(fields[0], 10, 64)
if err != nil {
return nil, &parseError{Path: dir, File: file, Err: err}
}
major := v
v, err = strconv.ParseUint(fields[1], 10, 64)
if err != nil {
return nil, &parseError{Path: dir, File: file, Err: err}
}
minor := v
op := ""
valueField := 2
if len(fields) == 4 {
op = fields[2]
valueField = 3
}
v, err = strconv.ParseUint(fields[valueField], 10, 64)
if err != nil {
return nil, &parseError{Path: dir, File: file, Err: err}
}
blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v})
}
if err := sc.Err(); err != nil {
return nil, &parseError{Path: dir, File: file, Err: err}
}
return blkioStats, nil
}
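// exampleBlkioStatLine is an illustrative, hypothetical helper (not part of
// runc itself) showing how a single line such as "8:0 Read 1282048" from the
// files documented above maps onto a cgroups.BlkioStatEntry, using the same
// FieldsFunc split that getBlkioStat applies to each line.
func exampleBlkioStatLine() cgroups.BlkioStatEntry {
	fields := strings.FieldsFunc("8:0 Read 1282048", splitBlkioStatLine)
	// fields == []string{"8", "0", "Read", "1282048"}
	major, _ := strconv.ParseUint(fields[0], 10, 64) // 8
	minor, _ := strconv.ParseUint(fields[1], 10, 64) // 0
	value, _ := strconv.ParseUint(fields[3], 10, 64) // 1282048
	return cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: fields[2], Value: value}
}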
func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
type blkioStatInfo struct {
filename string
blkioStatEntriesPtr *[]cgroups.BlkioStatEntry
}
bfqDebugStats := []blkioStatInfo{
{
filename: "blkio.bfq.sectors_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
},
{
filename: "blkio.bfq.io_service_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive,
},
{
filename: "blkio.bfq.io_wait_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive,
},
{
filename: "blkio.bfq.io_merged_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive,
},
{
filename: "blkio.bfq.io_queued_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive,
},
{
filename: "blkio.bfq.time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive,
},
{
filename: "blkio.bfq.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.bfq.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
bfqStats := []blkioStatInfo{
{
filename: "blkio.bfq.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.bfq.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
cfqStats := []blkioStatInfo{
{
filename: "blkio.sectors_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
},
{
filename: "blkio.io_service_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive,
},
{
filename: "blkio.io_wait_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive,
},
{
filename: "blkio.io_merged_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive,
},
{
filename: "blkio.io_queued_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive,
},
{
filename: "blkio.time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive,
},
{
filename: "blkio.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
throttleRecursiveStats := []blkioStatInfo{
{
filename: "blkio.throttle.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.throttle.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
baseStats := []blkioStatInfo{
{
filename: "blkio.throttle.io_serviced",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.throttle.io_service_bytes",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
orderedStats := [][]blkioStatInfo{
bfqDebugStats,
bfqStats,
cfqStats,
throttleRecursiveStats,
baseStats,
}
var blkioStats []cgroups.BlkioStatEntry
var err error
for _, statGroup := range orderedStats {
for i, statInfo := range statGroup {
if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil {
// if error occurs on first file, move to next group
if i == 0 {
break
}
return err
}
*statInfo.blkioStatEntriesPtr = blkioStats
// finish if all stats are gathered
if i == len(statGroup)-1 {
return nil
}
}
}
return nil
}
func (s *BlkioGroup) detectWeightFilenames(path string) {
if s.weightFilename != "" {
// Already detected.
return
}
if cgroups.PathExists(filepath.Join(path, "blkio.weight")) {
s.weightFilename = "blkio.weight"
s.weightDeviceFilename = "blkio.weight_device"
} else {
s.weightFilename = "blkio.bfq.weight"
s.weightDeviceFilename = "blkio.bfq.weight_device"
}
}
package fs
import (
"bufio"
"errors"
"fmt"
"os"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"golang.org/x/sys/unix"
)
type CpuGroup struct{}
func (s *CpuGroup) Name() string {
return "cpu"
}
func (s *CpuGroup) Apply(path string, r *cgroups.Resources, pid int) error {
if err := os.MkdirAll(path, 0o755); err != nil {
return err
}
// We should set the real-time group scheduling settings before moving
// the process in, because if the process is already in SCHED_RR mode
// and no RT bandwidth is set, adding it will fail.
if err := s.SetRtSched(path, r); err != nil {
return err
}
// Since we are not using apply(), we need to place the pid
// into the procs file.
return cgroups.WriteCgroupProc(path, pid)
}
func (s *CpuGroup) SetRtSched(path string, r *cgroups.Resources) error {
var period string
if r.CpuRtPeriod != 0 {
period = strconv.FormatUint(r.CpuRtPeriod, 10)
if err := cgroups.WriteFile(path, "cpu.rt_period_us", period); err != nil {
// The values of cpu.rt_period_us and cpu.rt_runtime_us
// are inter-dependent and need to be set in a proper order.
// If the kernel rejects the new period value with EINVAL
// and the new runtime value is also being set, let's
// ignore the error for now and retry later.
if !errors.Is(err, unix.EINVAL) || r.CpuRtRuntime == 0 {
return err
}
} else {
period = ""
}
}
if r.CpuRtRuntime != 0 {
if err := cgroups.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil {
return err
}
if period != "" {
if err := cgroups.WriteFile(path, "cpu.rt_period_us", period); err != nil {
return err
}
}
}
return nil
}
func (s *CpuGroup) Set(path string, r *cgroups.Resources) error {
if r.CpuShares != 0 {
shares := r.CpuShares
if err := cgroups.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil {
return err
}
// read it back
sharesRead, err := fscommon.GetCgroupParamUint(path, "cpu.shares")
if err != nil {
return err
}
// ... and check
if shares > sharesRead {
return fmt.Errorf("the maximum allowed cpu-shares is %d", sharesRead)
} else if shares < sharesRead {
return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead)
}
}
var period string
if r.CpuPeriod != 0 {
period = strconv.FormatUint(r.CpuPeriod, 10)
if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil {
// Sometimes when the period to be set is smaller
// than the current one, it is rejected by the kernel
// (EINVAL) as old_quota/new_period exceeds the parent
// cgroup quota limit. If this happens and the quota is
// going to be set, ignore the error for now and retry
// after setting the quota.
if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 {
return err
}
} else {
period = ""
}
}
var burst string
if r.CpuBurst != nil {
burst = strconv.FormatUint(*r.CpuBurst, 10)
if err := cgroups.WriteFile(path, "cpu.cfs_burst_us", burst); err != nil {
if errors.Is(err, unix.ENOENT) {
// If CPU burst knob is not available (e.g.
// older kernel), ignore it.
burst = ""
} else {
// Sometimes when the burst to be set is larger
// than the current one, it is rejected by the kernel
// (EINVAL) as old_quota/new_burst exceeds the parent
// cgroup quota limit. If this happens and the quota is
// going to be set, ignore the error for now and retry
// after setting the quota.
if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 {
return err
}
}
} else {
burst = ""
}
}
if r.CpuQuota != 0 {
if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil {
return err
}
if period != "" {
if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil {
return err
}
}
if burst != "" {
if err := cgroups.WriteFile(path, "cpu.cfs_burst_us", burst); err != nil {
return err
}
}
}
if r.CPUIdle != nil {
idle := strconv.FormatInt(*r.CPUIdle, 10)
if err := cgroups.WriteFile(path, "cpu.idle", idle); err != nil {
return err
}
}
return s.SetRtSched(path, r)
}
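// exampleSharesCheck is an illustrative, hypothetical helper (not part of
// runc itself) showing why Set reads cpu.shares back after writing it: the
// kernel silently clamps the value to its supported range, so a mismatch
// between the requested and the stored value means the request was not honored.
func exampleSharesCheck(requested, stored uint64) error {
	if requested > stored {
		return fmt.Errorf("the maximum allowed cpu-shares is %d", stored)
	}
	if requested < stored {
		return fmt.Errorf("the minimum allowed cpu-shares is %d", stored)
	}
	return nil
}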
func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
const file = "cpu.stat"
f, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return &parseError{Path: path, File: file, Err: err}
}
switch t {
case "nr_periods":
stats.CpuStats.ThrottlingData.Periods = v
case "nr_throttled":
stats.CpuStats.ThrottlingData.ThrottledPeriods = v
case "throttled_time":
stats.CpuStats.ThrottlingData.ThrottledTime = v
}
}
return nil
}
package fs
import (
"bufio"
"os"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
const (
nsInSec = 1000000000
// The value comes from `C.sysconf(C._SC_CLK_TCK)`, and
// on Linux it's a constant which is safe to be hard coded,
// so we can avoid using cgo here. For details, see:
// https://github.com/containerd/cgroups/pull/12
clockTicks uint64 = 100
)
type CpuacctGroup struct{}
func (s *CpuacctGroup) Name() string {
return "cpuacct"
}
func (s *CpuacctGroup) Apply(path string, _ *cgroups.Resources, pid int) error {
return apply(path, pid)
}
func (s *CpuacctGroup) Set(_ string, _ *cgroups.Resources) error {
return nil
}
func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {
if !cgroups.PathExists(path) {
return nil
}
userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path)
if err != nil {
return err
}
totalUsage, err := fscommon.GetCgroupParamUint(path, "cpuacct.usage")
if err != nil {
return err
}
percpuUsage, err := getPercpuUsage(path)
if err != nil {
return err
}
percpuUsageInKernelmode, percpuUsageInUsermode, err := getPercpuUsageInModes(path)
if err != nil {
return err
}
stats.CpuStats.CpuUsage.TotalUsage = totalUsage
stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage
stats.CpuStats.CpuUsage.PercpuUsageInKernelmode = percpuUsageInKernelmode
stats.CpuStats.CpuUsage.PercpuUsageInUsermode = percpuUsageInUsermode
stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage
stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage
return nil
}
// Returns user and kernel usage breakdown in nanoseconds.
func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
var userModeUsage, kernelModeUsage uint64
const (
userField = "user"
systemField = "system"
file = "cpuacct.stat"
)
// Expected format:
// user <usage in ticks>
// system <usage in ticks>
data, err := cgroups.ReadFile(path, file)
if err != nil {
return 0, 0, err
}
fields := strings.Fields(data)
if len(fields) < 4 || fields[0] != userField || fields[2] != systemField {
return 0, 0, malformedLine(path, file, data)
}
if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil {
return 0, 0, &parseError{Path: path, File: file, Err: err}
}
if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil {
return 0, 0, &parseError{Path: path, File: file, Err: err}
}
return (userModeUsage * nsInSec) / clockTicks, (kernelModeUsage * nsInSec) / clockTicks, nil
}
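// exampleTicksToNs is an illustrative, hypothetical helper (not part of runc
// itself) showing the conversion getCpuUsageBreakdown performs on the values
// read from cpuacct.stat: with clockTicks == 100, one tick is 10ms, so e.g.
// 250 user ticks become 2500000000ns (2.5s).
func exampleTicksToNs(ticks uint64) uint64 {
	return (ticks * nsInSec) / clockTicks
}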
func getPercpuUsage(path string) ([]uint64, error) {
const file = "cpuacct.usage_percpu"
percpuUsage := []uint64{}
data, err := cgroups.ReadFile(path, file)
if err != nil {
return percpuUsage, err
}
for _, value := range strings.Fields(data) {
value, err := strconv.ParseUint(value, 10, 64)
if err != nil {
return percpuUsage, &parseError{Path: path, File: file, Err: err}
}
percpuUsage = append(percpuUsage, value)
}
return percpuUsage, nil
}
func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) {
usageKernelMode := []uint64{}
usageUserMode := []uint64{}
const file = "cpuacct.usage_all"
fd, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if os.IsNotExist(err) {
return usageKernelMode, usageUserMode, nil
} else if err != nil {
return nil, nil, err
}
defer fd.Close()
scanner := bufio.NewScanner(fd)
scanner.Scan() // skipping header line
for scanner.Scan() {
// Each line is: cpu user system
fields := strings.SplitN(scanner.Text(), " ", 3)
if len(fields) != 3 {
continue
}
user, err := strconv.ParseUint(fields[1], 10, 64)
if err != nil {
return nil, nil, &parseError{Path: path, File: file, Err: err}
}
usageUserMode = append(usageUserMode, user)
kernel, err := strconv.ParseUint(fields[2], 10, 64)
if err != nil {
return nil, nil, &parseError{Path: path, File: file, Err: err}
}
usageKernelMode = append(usageKernelMode, kernel)
}
if err := scanner.Err(); err != nil {
return nil, nil, &parseError{Path: path, File: file, Err: err}
}
return usageKernelMode, usageUserMode, nil
}
package fs
import (
"errors"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
var (
cpusetLock sync.Mutex
cpusetPrefix = "cpuset."
cpusetFastPath bool
)
func cpusetFile(path string, name string) string {
cpusetLock.Lock()
defer cpusetLock.Unlock()
// Only the v1 cpuset cgroup is allowed to mount with noprefix.
// See kernel source: https://github.com/torvalds/linux/blob/2e1b3cc9d7f790145a80cb705b168f05dab65df2/kernel/cgroup/cgroup-v1.c#L1070
// Cpuset cannot be mounted with and without the prefix simultaneously.
// Mounting without the prefix is commonly seen in Android environments.
if cpusetFastPath {
return cpusetPrefix + name
}
err := unix.Access(filepath.Join(path, cpusetPrefix+name), unix.F_OK)
if err == nil {
// Use the fast path only if we can access one type of mount for cpuset already
cpusetFastPath = true
} else {
err = unix.Access(filepath.Join(path, name), unix.F_OK)
if err == nil {
cpusetPrefix = ""
cpusetFastPath = true
}
}
return cpusetPrefix + name
}
type CpusetGroup struct{}
func (s *CpusetGroup) Name() string {
return "cpuset"
}
func (s *CpusetGroup) Apply(path string, r *cgroups.Resources, pid int) error {
return s.ApplyDir(path, r, pid)
}
func (s *CpusetGroup) Set(path string, r *cgroups.Resources) error {
if r.CpusetCpus != "" {
if err := cgroups.WriteFile(path, cpusetFile(path, "cpus"), r.CpusetCpus); err != nil {
return err
}
}
if r.CpusetMems != "" {
if err := cgroups.WriteFile(path, cpusetFile(path, "mems"), r.CpusetMems); err != nil {
return err
}
}
return nil
}
func getCpusetStat(path string, file string) ([]uint16, error) {
var extracted []uint16
fileContent, err := fscommon.GetCgroupParamString(path, file)
if err != nil {
return extracted, err
}
if len(fileContent) == 0 {
return extracted, &parseError{Path: path, File: file, Err: errors.New("empty file")}
}
for _, s := range strings.Split(fileContent, ",") {
fromStr, toStr, ok := strings.Cut(s, "-")
if ok {
from, err := strconv.ParseUint(fromStr, 10, 16)
if err != nil {
return extracted, &parseError{Path: path, File: file, Err: err}
}
to, err := strconv.ParseUint(toStr, 10, 16)
if err != nil {
return extracted, &parseError{Path: path, File: file, Err: err}
}
if from > to {
return extracted, &parseError{Path: path, File: file, Err: errors.New("invalid values, from > to")}
}
for i := from; i <= to; i++ {
extracted = append(extracted, uint16(i))
}
} else {
value, err := strconv.ParseUint(s, 10, 16)
if err != nil {
return extracted, &parseError{Path: path, File: file, Err: err}
}
extracted = append(extracted, uint16(value))
}
}
return extracted, nil
}
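// exampleCpusetList is an illustrative, hypothetical helper (not part of runc
// itself) showing what getCpusetStat produces for a typical cpuset list value:
// the string "0-2,7" expands to []uint16{0, 1, 2, 7}.
func exampleCpusetList() []uint16 {
	var out []uint16
	for _, s := range strings.Split("0-2,7", ",") {
		fromStr, toStr, ok := strings.Cut(s, "-")
		if !ok {
			// A single CPU or memory node, e.g. "7".
			v, _ := strconv.ParseUint(s, 10, 16)
			out = append(out, uint16(v))
			continue
		}
		// An inclusive range, e.g. "0-2".
		from, _ := strconv.ParseUint(fromStr, 10, 16)
		to, _ := strconv.ParseUint(toStr, 10, 16)
		for i := from; i <= to; i++ {
			out = append(out, uint16(i))
		}
	}
	return out
}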
func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
var err error
stats.CPUSetStats.CPUs, err = getCpusetStat(path, cpusetFile(path, "cpus"))
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "cpu_exclusive"))
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.Mems, err = getCpusetStat(path, cpusetFile(path, "mems"))
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "mem_hardwall"))
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "mem_exclusive"))
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemoryMigrate, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "memory_migrate"))
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "memory_spread_page"))
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "memory_spread_slab"))
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "memory_pressure"))
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, cpusetFile(path, "sched_load_balance"))
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, cpusetFile(path, "sched_relax_domain_level"))
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
return nil
}
func (s *CpusetGroup) ApplyDir(dir string, r *cgroups.Resources, pid int) error {
// This might happen if we have no cpuset cgroup mounted.
// Just do nothing and don't fail.
if dir == "" {
return nil
}
// cpusetEnsureParent starts with the parent because we don't want to
// explicitly inherit from the parent; it could conflict with
// 'cpuset.cpu_exclusive'.
if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil {
return err
}
if err := os.Mkdir(dir, 0o755); err != nil && !os.IsExist(err) {
return err
}
// We didn't inherit cpuset configs from parent, but we have
// to ensure cpuset configs are set before moving task into the
// cgroup.
// The logic is, if user specified cpuset configs, use these
// specified configs, otherwise, inherit from parent. This makes
// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
// keep backward compatibility.
if err := s.ensureCpusAndMems(dir, r); err != nil {
return err
}
// Since we are not using apply(), we need to place the pid
// into the procs file.
return cgroups.WriteCgroupProc(dir, pid)
}
func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) {
if cpus, err = cgroups.ReadFile(parent, cpusetFile(parent, "cpus")); err != nil {
return
}
if mems, err = cgroups.ReadFile(parent, cpusetFile(parent, "mems")); err != nil {
return
}
return cpus, mems, nil
}
// cpusetEnsureParent makes sure that the parent directories of current
// are created and populated with the proper cpus and mems files copied
// from their respective parent. It does that recursively, starting from
// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point).
func cpusetEnsureParent(current string) error {
var st unix.Statfs_t
parent := filepath.Dir(current)
err := unix.Statfs(parent, &st)
if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC {
return nil
}
// Treat non-existing directory as cgroupfs as it will be created,
// and the root cpuset directory obviously exists.
if err != nil && err != unix.ENOENT {
return &os.PathError{Op: "statfs", Path: parent, Err: err}
}
if err := cpusetEnsureParent(parent); err != nil {
return err
}
if err := os.Mkdir(current, 0o755); err != nil && !os.IsExist(err) {
return err
}
return cpusetCopyIfNeeded(current, parent)
}
// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent
// directory to the current directory if the current directory's files are empty.
func cpusetCopyIfNeeded(current, parent string) error {
currentCpus, currentMems, err := getCpusetSubsystemSettings(current)
if err != nil {
return err
}
parentCpus, parentMems, err := getCpusetSubsystemSettings(parent)
if err != nil {
return err
}
if isEmptyCpuset(currentCpus) {
if err := cgroups.WriteFile(current, cpusetFile(current, "cpus"), parentCpus); err != nil {
return err
}
}
if isEmptyCpuset(currentMems) {
if err := cgroups.WriteFile(current, cpusetFile(current, "mems"), parentMems); err != nil {
return err
}
}
return nil
}
func isEmptyCpuset(str string) bool {
return str == "" || str == "\n"
}
func (s *CpusetGroup) ensureCpusAndMems(path string, r *cgroups.Resources) error {
if err := s.Set(path, r); err != nil {
return err
}
return cpusetCopyIfNeeded(path, filepath.Dir(path))
}
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
)
type DevicesGroup struct{}
func (s *DevicesGroup) Name() string {
return "devices"
}
func (s *DevicesGroup) Apply(path string, r *cgroups.Resources, pid int) error {
if r.SkipDevices {
return nil
}
if path == "" {
// Return error here, since devices cgroup
// is a hard requirement for container's security.
return errSubsystemDoesNotExist
}
return apply(path, pid)
}
func (s *DevicesGroup) Set(path string, r *cgroups.Resources) error {
if cgroups.DevicesSetV1 == nil {
if len(r.Devices) == 0 {
return nil
}
return cgroups.ErrDevicesUnsupported
}
return cgroups.DevicesSetV1(path, r)
}
func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
package fs
import (
"fmt"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
type parseError = fscommon.ParseError
// malformedLine is used by all cgroupfs file parsers that expect a line
// in a particular format but get some garbage instead.
func malformedLine(path, file, line string) error {
return &parseError{Path: path, File: file, Err: fmt.Errorf("malformed line: %s", line)}
}
package fs
import (
"errors"
"fmt"
"os"
"strings"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
type FreezerGroup struct{}
func (s *FreezerGroup) Name() string {
return "freezer"
}
func (s *FreezerGroup) Apply(path string, _ *cgroups.Resources, pid int) error {
return apply(path, pid)
}
func (s *FreezerGroup) Set(path string, r *cgroups.Resources) (Err error) {
switch r.Freezer {
case cgroups.Frozen:
defer func() {
if Err != nil {
// Freezing failed, and it is bad and dangerous
// to leave the cgroup in FROZEN or FREEZING
// state, so (try to) thaw it back.
_ = cgroups.WriteFile(path, "freezer.state", string(cgroups.Thawed))
}
}()
// As per older kernel docs (freezer-subsystem.txt before
// kernel commit ef9fe980c6fcc1821), if FREEZING is seen,
// userspace should either retry or thaw. While current
// kernel cgroup v1 docs no longer mention a need to retry,
// even a recent kernel (v5.4, Ubuntu 20.04) can't reliably
// freeze a cgroup v1 while new processes keep appearing in it
// (either via fork/clone or by writing new PIDs to
// cgroup.procs).
//
// The numbers below are empirically chosen to have a decent
// chance to succeed in various scenarios ("runc pause/unpause
// with parallel runc exec" and "bare freeze/unfreeze on a very
// slow system"), tested on RHEL7 and Ubuntu 20.04 kernels.
//
// Adding any amount of sleep in between retries did not
// increase the chances of successful freeze in "pause/unpause
// with parallel exec" reproducer. OTOH, adding an occasional
// sleep helped for the case where the system is extremely slow
// (CentOS 7 VM on GHA CI).
//
// Alas, this is still a game of chance, since the real fix
// belongs in the kernel (cgroup v2 does not have this bug).
for i := 0; i < 1000; i++ {
if i%50 == 49 {
// Occasional thaw and sleep improves
// the chances to succeed in freezing
// in case new processes keep appearing
// in the cgroup.
_ = cgroups.WriteFile(path, "freezer.state", string(cgroups.Thawed))
time.Sleep(10 * time.Millisecond)
}
if err := cgroups.WriteFile(path, "freezer.state", string(cgroups.Frozen)); err != nil {
return err
}
if i%25 == 24 {
// Occasional short sleep before reading
// the state back also improves the chances to
// succeed in freezing in case of a very slow
// system.
time.Sleep(10 * time.Microsecond)
}
state, err := cgroups.ReadFile(path, "freezer.state")
if err != nil {
return err
}
state = strings.TrimSpace(state)
switch state {
case "FREEZING":
continue
case string(cgroups.Frozen):
if i > 1 {
logrus.Debugf("frozen after %d retries", i)
}
return nil
default:
// should never happen
return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state))
}
}
// Despite our best efforts, it got stuck in FREEZING.
return errors.New("unable to freeze")
case cgroups.Thawed:
return cgroups.WriteFile(path, "freezer.state", string(cgroups.Thawed))
case cgroups.Undefined:
return nil
default:
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer))
}
}
func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
func (s *FreezerGroup) GetState(path string) (cgroups.FreezerState, error) {
for {
state, err := cgroups.ReadFile(path, "freezer.state")
if err != nil {
// If the kernel is too old, then we just treat the freezer as
// being in an "undefined" state.
if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) {
err = nil
}
return cgroups.Undefined, err
}
switch strings.TrimSpace(state) {
case "THAWED":
return cgroups.Thawed, nil
case "FROZEN":
// Find out whether the cgroup is frozen directly,
// or indirectly via an ancestor.
self, err := cgroups.ReadFile(path, "freezer.self_freezing")
if err != nil {
// If the kernel is too old, then we just treat
// it as being frozen.
if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.ENODEV) {
err = nil
}
return cgroups.Frozen, err
}
switch self {
case "0\n":
return cgroups.Thawed, nil
case "1\n":
return cgroups.Frozen, nil
default:
return cgroups.Undefined, fmt.Errorf(`unknown "freezer.self_freezing" state: %q`, self)
}
case "FREEZING":
// Make sure we get a stable freezer state, so retry if the cgroup
// is still undergoing freezing. This should be a temporary delay.
time.Sleep(1 * time.Millisecond)
continue
default:
return cgroups.Undefined, fmt.Errorf("unknown freezer.state %q", state)
}
}
}
package fs
import (
"errors"
"fmt"
"os"
"sync"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
var subsystems = []subsystem{
&CpusetGroup{},
&DevicesGroup{},
&MemoryGroup{},
&CpuGroup{},
&CpuacctGroup{},
&PidsGroup{},
&BlkioGroup{},
&HugetlbGroup{},
&NetClsGroup{},
&NetPrioGroup{},
&PerfEventGroup{},
&FreezerGroup{},
&RdmaGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
&NameGroup{GroupName: "misc", Join: true},
}
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
func init() {
// If the host is using cgroup hybrid mode, add a "" controller
// indicating the process should also join the cgroup v2 hierarchy.
if cgroups.IsCgroup2HybridMode() {
subsystems = append(subsystems, &NameGroup{GroupName: "", Join: true})
}
}
type subsystem interface {
// Name returns the name of the subsystem.
Name() string
// GetStats fills in the stats for the subsystem.
GetStats(path string, stats *cgroups.Stats) error
// Apply creates and joins a cgroup, adding pid into it. Some
// subsystems use resources to pre-configure the cgroup parents
// before creating or joining it.
Apply(path string, r *cgroups.Resources, pid int) error
// Set sets the cgroup resources.
Set(path string, r *cgroups.Resources) error
}
type Manager struct {
mu sync.Mutex
cgroups *cgroups.Cgroup
paths map[string]string
}
func NewManager(cg *cgroups.Cgroup, paths map[string]string) (*Manager, error) {
// Some v1 controllers (cpu, cpuset, and devices) expect
// cgroups.Resources to not be nil in Apply.
if cg.Resources == nil {
return nil, errors.New("cgroup v1 manager needs cgroups.Resources to be set during manager creation")
}
if cg.Resources.Unified != nil {
return nil, cgroups.ErrV1NoUnified
}
if paths == nil {
var err error
paths, err = initPaths(cg)
if err != nil {
return nil, err
}
}
return &Manager{
cgroups: cg,
paths: paths,
}, nil
}
// isIgnorableError returns whether err is a permission error (in the loose
// sense of the word). This includes EROFS (which for an unprivileged user is
// basically a permission error) and EACCES (for similar reasons) as well as
// the normal EPERM.
func isIgnorableError(rootless bool, err error) bool {
// We do not ignore errors if we are root.
if !rootless {
return false
}
// Is it an ordinary EPERM?
if errors.Is(err, os.ErrPermission) {
return true
}
// Handle some specific syscall errors.
var errno unix.Errno
if errors.As(err, &errno) {
return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES
}
return false
}
func (m *Manager) Apply(pid int) (retErr error) {
m.mu.Lock()
defer m.mu.Unlock()
c := m.cgroups
for _, sys := range subsystems {
name := sys.Name()
p, ok := m.paths[name]
if !ok {
continue
}
if err := sys.Apply(p, c.Resources, pid); err != nil {
// In the case of rootless (including euid=0 in userns), where an
// explicit cgroup path hasn't been set, we don't bail on error in
// case of permission problems here, but do delete the path from
// the m.paths map, since it is either non-existent and could not
// be created, or the pid could not be added to it.
//
// Cases where limits for the subsystem have been set are handled
// later by Set, which fails with a friendly error (see
// if path == "" in Set).
if isIgnorableError(c.Rootless, err) && c.Path == "" {
retErr = cgroups.ErrRootless
delete(m.paths, name)
continue
}
return err
}
}
return retErr
}
func (m *Manager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
return cgroups.RemovePaths(m.paths)
}
func (m *Manager) Path(subsys string) string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths[subsys]
}
func (m *Manager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
stats := cgroups.NewStats()
for _, sys := range subsystems {
path := m.paths[sys.Name()]
if path == "" {
continue
}
if err := sys.GetStats(path, stats); err != nil {
return nil, err
}
}
return stats, nil
}
func (m *Manager) Set(r *cgroups.Resources) error {
if r == nil {
return nil
}
if r.Unified != nil {
return cgroups.ErrV1NoUnified
}
m.mu.Lock()
defer m.mu.Unlock()
for _, sys := range subsystems {
path := m.paths[sys.Name()]
if err := sys.Set(path, r); err != nil {
// When rootless is true, errors from the device subsystem
// are ignored, as it is really not expected to work.
if m.cgroups.Rootless && sys.Name() == "devices" && !errors.Is(err, cgroups.ErrDevicesUnsupported) {
continue
}
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if path == "" {
// We never created a path for this cgroup, so we cannot set
// limits for it (though we have already tried at this point).
return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name())
}
return err
}
}
return nil
}
// Freeze toggles the container's freezer cgroup depending on the state
// provided
func (m *Manager) Freeze(state cgroups.FreezerState) error {
path := m.Path("freezer")
if path == "" {
return errors.New("cannot toggle freezer: cgroups not configured for container")
}
prevState := m.cgroups.Resources.Freezer
m.cgroups.Resources.Freezer = state
freezer := &FreezerGroup{}
if err := freezer.Set(path, m.cgroups.Resources); err != nil {
m.cgroups.Resources.Freezer = prevState
return err
}
return nil
}
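// exampleFreezeThaw is an illustrative, hypothetical helper (not part of runc
// itself) showing how a caller would pause and resume a container's processes
// through the Manager's freezer cgroup.
func exampleFreezeThaw(m *Manager) error {
	if err := m.Freeze(cgroups.Frozen); err != nil {
		return err
	}
	// ... inspect or checkpoint the frozen processes here ...
	return m.Freeze(cgroups.Thawed)
}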
func (m *Manager) GetPids() ([]int, error) {
return cgroups.GetPids(m.Path("devices"))
}
func (m *Manager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.Path("devices"))
}
func (m *Manager) GetPaths() map[string]string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths
}
func (m *Manager) GetCgroups() (*cgroups.Cgroup, error) {
return m.cgroups, nil
}
func (m *Manager) GetFreezerState() (cgroups.FreezerState, error) {
dir := m.Path("freezer")
// If the container doesn't have the freezer cgroup, say it's undefined.
if dir == "" {
return cgroups.Undefined, nil
}
freezer := &FreezerGroup{}
return freezer.GetState(dir)
}
func (m *Manager) Exists() bool {
return cgroups.PathExists(m.Path("devices"))
}
func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill")
}
func (m *Manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.Path("memory"))
// Ignore ENOENT when rootless as it couldn't create cgroup.
if err != nil && m.cgroups.Rootless && os.IsNotExist(err) {
err = nil
}
return c, err
}
package fs
import (
"errors"
"os"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
type HugetlbGroup struct{}
func (s *HugetlbGroup) Name() string {
return "hugetlb"
}
func (s *HugetlbGroup) Apply(path string, _ *cgroups.Resources, pid int) error {
return apply(path, pid)
}
func (s *HugetlbGroup) Set(path string, r *cgroups.Resources) error {
const suffix = ".limit_in_bytes"
skipRsvd := false
for _, hugetlb := range r.HugetlbLimit {
prefix := "hugetlb." + hugetlb.Pagesize
val := strconv.FormatUint(hugetlb.Limit, 10)
if err := cgroups.WriteFile(path, prefix+suffix, val); err != nil {
return err
}
if skipRsvd {
continue
}
if err := cgroups.WriteFile(path, prefix+".rsvd"+suffix, val); err != nil {
if errors.Is(err, os.ErrNotExist) {
skipRsvd = true
continue
}
return err
}
}
return nil
}
func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
if !cgroups.PathExists(path) {
return nil
}
rsvd := ".rsvd"
hugetlbStats := cgroups.HugetlbStats{}
for _, pageSize := range cgroups.HugePageSizes() {
again:
prefix := "hugetlb." + pageSize + rsvd
value, err := fscommon.GetCgroupParamUint(path, prefix+".usage_in_bytes")
if err != nil {
if rsvd != "" && errors.Is(err, os.ErrNotExist) {
rsvd = ""
goto again
}
return err
}
hugetlbStats.Usage = value
value, err = fscommon.GetCgroupParamUint(path, prefix+".max_usage_in_bytes")
if err != nil {
return err
}
hugetlbStats.MaxUsage = value
value, err = fscommon.GetCgroupParamUint(path, prefix+".failcnt")
if err != nil {
return err
}
hugetlbStats.Failcnt = value
stats.HugetlbStats[pageSize] = hugetlbStats
}
return nil
}
package fs
import (
"bufio"
"errors"
"fmt"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
const (
cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
cgroupMemoryLimit = "memory.limit_in_bytes"
cgroupMemoryUsage = "memory.usage_in_bytes"
cgroupMemoryMaxUsage = "memory.max_usage_in_bytes"
)
type MemoryGroup struct{}
func (s *MemoryGroup) Name() string {
return "memory"
}
func (s *MemoryGroup) Apply(path string, _ *cgroups.Resources, pid int) error {
return apply(path, pid)
}
func setMemory(path string, val int64) error {
if val == 0 {
return nil
}
err := cgroups.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10))
if !errors.Is(err, unix.EBUSY) {
return err
}
// EBUSY means the kernel can't set new limit as it's too low
// (lower than the current usage). Return more specific error.
usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage)
if err != nil {
return err
}
max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage)
if err != nil {
return err
}
return fmt.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max)
}
func setSwap(path string, val int64) error {
if val == 0 {
return nil
}
return cgroups.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10))
}
func setMemoryAndSwap(path string, r *cgroups.Resources) error {
// If the memory limit is set to -1 and the swap limit is not explicitly
// set, also set swap to -1, meaning unlimited memory (and swap).
if r.Memory == -1 && r.MemorySwap == 0 {
// Only set swap if it's enabled in kernel
if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) {
r.MemorySwap = -1
}
}
// When both the memory and swap limits are set, we need to handle the
// order of the updates when changing the limits of a running container.
if r.Memory != 0 && r.MemorySwap != 0 {
curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit)
if err != nil {
return err
}
// When updating the memory limit, choose the write order for memory
// and swap so the update won't be rejected by the kernel's validation
// of the new values against the old ones.
if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) {
if err := setSwap(path, r.MemorySwap); err != nil {
return err
}
if err := setMemory(path, r.Memory); err != nil {
return err
}
return nil
}
}
if err := setMemory(path, r.Memory); err != nil {
return err
}
if err := setSwap(path, r.MemorySwap); err != nil {
return err
}
return nil
}
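// exampleRaiseMemoryAndSwap is an illustrative, hypothetical helper (not part
// of runc itself) showing the ordering concern setMemoryAndSwap deals with:
// the kernel keeps memory.memsw.limit_in_bytes >= memory.limit_in_bytes, so
// when raising both limits the swap limit is written first, and when lowering
// both the memory limit is written first.
func exampleRaiseMemoryAndSwap(path string, memory, swap int64) error {
	if err := setSwap(path, swap); err != nil { // raise memsw first
		return err
	}
	return setMemory(path, memory) // then raise memory up to the new memsw
}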
func (s *MemoryGroup) Set(path string, r *cgroups.Resources) error {
if err := setMemoryAndSwap(path, r); err != nil {
return err
}
// ignore KernelMemory and KernelMemoryTCP
if r.MemoryReservation != 0 {
if err := cgroups.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil {
return err
}
}
if r.OomKillDisable {
if err := cgroups.WriteFile(path, "memory.oom_control", "1"); err != nil {
return err
}
}
if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 {
return nil
} else if *r.MemorySwappiness <= 100 {
if err := cgroups.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil {
return err
}
} else {
return fmt.Errorf("invalid memory swappiness value: %d (valid range is 0-100)", *r.MemorySwappiness)
}
return nil
}
func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
const file = "memory.stat"
statsFile, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
defer statsFile.Close()
sc := bufio.NewScanner(statsFile)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return &parseError{Path: path, File: file, Err: err}
}
stats.MemoryStats.Stats[t] = v
}
stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"]
memoryUsage, err := getMemoryData(path, "")
if err != nil {
return err
}
stats.MemoryStats.Usage = memoryUsage
swapUsage, err := getMemoryData(path, "memsw")
if err != nil {
return err
}
stats.MemoryStats.SwapUsage = swapUsage
stats.MemoryStats.SwapOnlyUsage = cgroups.MemoryData{
Usage: swapUsage.Usage - memoryUsage.Usage,
Failcnt: swapUsage.Failcnt - memoryUsage.Failcnt,
}
kernelUsage, err := getMemoryData(path, "kmem")
if err != nil {
return err
}
stats.MemoryStats.KernelUsage = kernelUsage
kernelTCPUsage, err := getMemoryData(path, "kmem.tcp")
if err != nil {
return err
}
stats.MemoryStats.KernelTCPUsage = kernelTCPUsage
value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy")
if err != nil {
return err
}
if value == 1 {
stats.MemoryStats.UseHierarchy = true
}
pagesByNUMA, err := getPageUsageByNUMA(path)
if err != nil {
return err
}
stats.MemoryStats.PageUsageByNUMA = pagesByNUMA
return nil
}
func getMemoryData(path, name string) (cgroups.MemoryData, error) {
memoryData := cgroups.MemoryData{}
moduleName := "memory"
if name != "" {
moduleName = "memory." + name
}
var (
usage = moduleName + ".usage_in_bytes"
maxUsage = moduleName + ".max_usage_in_bytes"
failcnt = moduleName + ".failcnt"
limit = moduleName + ".limit_in_bytes"
)
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
if name != "" && os.IsNotExist(err) {
// Ignore ENOENT as swap and kmem controllers
// are optional in the kernel.
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, err
}
memoryData.Usage = value
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil {
return cgroups.MemoryData{}, err
}
memoryData.MaxUsage = value
value, err = fscommon.GetCgroupParamUint(path, failcnt)
if err != nil {
return cgroups.MemoryData{}, err
}
memoryData.Failcnt = value
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
if name == "kmem" && os.IsNotExist(err) {
// Ignore ENOENT as kmem.limit_in_bytes has
// been removed in newer kernels.
return memoryData, nil
}
return cgroups.MemoryData{}, err
}
memoryData.Limit = value
return memoryData, nil
}
func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) {
const (
maxColumns = math.MaxUint8 + 1
file = "memory.numa_stat"
)
stats := cgroups.PageUsageByNUMA{}
fd, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if os.IsNotExist(err) {
return stats, nil
} else if err != nil {
return stats, err
}
defer fd.Close()
// File format is documented in linux/Documentation/cgroup-v1/memory.txt
// and it looks like this:
//
// total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
// file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
// anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
// unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
// hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
scanner := bufio.NewScanner(fd)
for scanner.Scan() {
var field *cgroups.PageStats
line := scanner.Text()
columns := strings.SplitN(line, " ", maxColumns)
for i, column := range columns {
key, val, ok := strings.Cut(column, "=")
// Some custom kernels have non-standard fields, like
// numa_locality 0 0 0 0 0 0 0 0 0 0
// numa_exectime 0
if !ok {
if i == 0 {
// Ignore/skip those.
break
} else {
// The first column was already validated,
// so be strict about the rest.
return stats, malformedLine(path, file, line)
}
}
if i == 0 { // First column: key is name, val is total.
field = getNUMAField(&stats, key)
if field == nil { // unknown field (new kernel?)
break
}
field.Total, err = strconv.ParseUint(val, 0, 64)
if err != nil {
return stats, &parseError{Path: path, File: file, Err: err}
}
field.Nodes = map[uint8]uint64{}
} else { // Subsequent columns: key is N<id>, val is usage.
if len(key) < 2 || key[0] != 'N' {
// This is definitely an error.
return stats, malformedLine(path, file, line)
}
n, err := strconv.ParseUint(key[1:], 10, 8)
if err != nil {
return stats, &parseError{Path: path, File: file, Err: err}
}
usage, err := strconv.ParseUint(val, 10, 64)
if err != nil {
return stats, &parseError{Path: path, File: file, Err: err}
}
field.Nodes[uint8(n)] = usage
}
}
}
if err := scanner.Err(); err != nil {
return cgroups.PageUsageByNUMA{}, &parseError{Path: path, File: file, Err: err}
}
return stats, nil
}
func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats {
switch name {
case "total":
return &stats.Total
case "file":
return &stats.File
case "anon":
return &stats.Anon
case "unevictable":
return &stats.Unevictable
case "hierarchical_total":
return &stats.Hierarchical.Total
case "hierarchical_file":
return &stats.Hierarchical.File
case "hierarchical_anon":
return &stats.Hierarchical.Anon
case "hierarchical_unevictable":
return &stats.Hierarchical.Unevictable
}
return nil
}
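// exampleNUMAStatLine is an illustrative, hypothetical helper (not part of
// runc itself) showing how getPageUsageByNUMA interprets one memory.numa_stat
// line such as "total=44611 N0=32631 N1=11980": the first column carries the
// field name and its total, and the remaining N<id>=<pages> columns carry the
// per-node breakdown.
func exampleNUMAStatLine() cgroups.PageStats {
	ps := cgroups.PageStats{Nodes: map[uint8]uint64{}}
	for i, column := range strings.Split("total=44611 N0=32631 N1=11980", " ") {
		key, val, _ := strings.Cut(column, "=")
		v, _ := strconv.ParseUint(val, 10, 64)
		if i == 0 {
			ps.Total = v // key is the field name, "total"
			continue
		}
		n, _ := strconv.ParseUint(key[1:], 10, 8) // key is "N0", "N1", ...
		ps.Nodes[uint8(n)] = v
	}
	return ps
}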
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
)
type NameGroup struct {
GroupName string
Join bool
}
func (s *NameGroup) Name() string {
return s.GroupName
}
func (s *NameGroup) Apply(path string, _ *cgroups.Resources, pid int) error {
if s.Join {
// Ignore errors if the named cgroup does not exist.
_ = apply(path, pid)
}
return nil
}
func (s *NameGroup) Set(_ string, _ *cgroups.Resources) error {
return nil
}
func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
package fs
import (
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
type NetClsGroup struct{}
func (s *NetClsGroup) Name() string {
return "net_cls"
}
func (s *NetClsGroup) Apply(path string, _ *cgroups.Resources, pid int) error {
return apply(path, pid)
}
func (s *NetClsGroup) Set(path string, r *cgroups.Resources) error {
if r.NetClsClassid != 0 {
if err := cgroups.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil {
return err
}
}
return nil
}
func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
)
type NetPrioGroup struct{}
func (s *NetPrioGroup) Name() string {
return "net_prio"
}
func (s *NetPrioGroup) Apply(path string, _ *cgroups.Resources, pid int) error {
return apply(path, pid)
}
func (s *NetPrioGroup) Set(path string, r *cgroups.Resources) error {
for _, prioMap := range r.NetPrioIfpriomap {
if err := cgroups.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil {
return err
}
}
return nil
}
func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
package fs
import (
"errors"
"os"
"path/filepath"
"sync"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/internal/path"
)
// The absolute path to the root of the cgroup hierarchies.
var (
cgroupRootLock sync.Mutex
cgroupRoot string
)
const defaultCgroupRoot = "/sys/fs/cgroup"
func initPaths(cg *cgroups.Cgroup) (map[string]string, error) {
root, err := rootPath()
if err != nil {
return nil, err
}
inner, err := path.Inner(cg)
if err != nil {
return nil, err
}
paths := make(map[string]string)
for _, sys := range subsystems {
name := sys.Name()
path, err := subsysPath(root, inner, name)
if err != nil {
// The non-presence of the devices subsystem
// is considered fatal for security reasons.
if cgroups.IsNotFound(err) && (cg.SkipDevices || name != "devices") {
continue
}
return nil, err
}
paths[name] = path
}
return paths, nil
}
func tryDefaultCgroupRoot() string {
var st, pst unix.Stat_t
// (1) it should be a directory...
err := unix.Lstat(defaultCgroupRoot, &st)
if err != nil || st.Mode&unix.S_IFDIR == 0 {
return ""
}
// (2) ... and a mount point ...
err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst)
if err != nil {
return ""
}
if st.Dev == pst.Dev {
// parent dir has the same dev -- not a mount point
return ""
}
// (3) ... of 'tmpfs' fs type.
var fst unix.Statfs_t
err = unix.Statfs(defaultCgroupRoot, &fst)
if err != nil || fst.Type != unix.TMPFS_MAGIC {
return ""
}
// (4) it should have at least 1 entry ...
dir, err := os.Open(defaultCgroupRoot)
if err != nil {
return ""
}
defer dir.Close()
names, err := dir.Readdirnames(1)
if err != nil {
return ""
}
if len(names) < 1 {
return ""
}
// ... which is a cgroup mount point.
err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst)
if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC {
return ""
}
return defaultCgroupRoot
}
// rootPath finds and returns path to the root of the cgroup hierarchies.
func rootPath() (string, error) {
cgroupRootLock.Lock()
defer cgroupRootLock.Unlock()
if cgroupRoot != "" {
return cgroupRoot, nil
}
// fast path
cgroupRoot = tryDefaultCgroupRoot()
if cgroupRoot != "" {
return cgroupRoot, nil
}
// slow path: parse mountinfo
mi, err := cgroups.GetCgroupMounts(false)
if err != nil {
return "", err
}
if len(mi) < 1 {
return "", errors.New("no cgroup mount found in mountinfo")
}
// Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"),
// use its parent directory.
root := filepath.Dir(mi[0].Mountpoint)
if _, err := os.Stat(root); err != nil {
return "", err
}
cgroupRoot = root
return cgroupRoot, nil
}
func subsysPath(root, inner, subsystem string) (string, error) {
// If the cgroup name/path is absolute, do not look it up relative to the cgroup of the init process.
if filepath.IsAbs(inner) {
mnt, err := cgroups.FindCgroupMountpoint(root, subsystem)
// If the subsystem is not mounted, there is no point in making the path.
if err != nil {
return "", err
}
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
return filepath.Join(root, filepath.Base(mnt), inner), nil
}
// Use GetOwnCgroupPath for dind-like cases, when cgroupns is not
// available. This is ugly.
parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
if err != nil {
return "", err
}
return filepath.Join(parentPath, inner), nil
}
func apply(path string, pid int) error {
if path == "" {
return nil
}
if err := os.MkdirAll(path, 0o755); err != nil {
return err
}
return cgroups.WriteCgroupProc(path, pid)
}
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
)
type PerfEventGroup struct{}
func (s *PerfEventGroup) Name() string {
return "perf_event"
}
func (s *PerfEventGroup) Apply(path string, _ *cgroups.Resources, pid int) error {
return apply(path, pid)
}
func (s *PerfEventGroup) Set(_ string, _ *cgroups.Resources) error {
return nil
}
func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
package fs
import (
"math"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
type PidsGroup struct{}
func (s *PidsGroup) Name() string {
return "pids"
}
func (s *PidsGroup) Apply(path string, _ *cgroups.Resources, pid int) error {
return apply(path, pid)
}
func (s *PidsGroup) Set(path string, r *cgroups.Resources) error {
if r.PidsLimit != 0 {
// "max" is the fallback value.
limit := "max"
if r.PidsLimit > 0 {
limit = strconv.FormatInt(r.PidsLimit, 10)
}
if err := cgroups.WriteFile(path, "pids.max", limit); err != nil {
return err
}
}
return nil
}
func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
if !cgroups.PathExists(path) {
return nil
}
current, err := fscommon.GetCgroupParamUint(path, "pids.current")
if err != nil {
return err
}
max, err := fscommon.GetCgroupParamUint(path, "pids.max")
if err != nil {
return err
}
// If no limit is set, read from pids.max returns "max", which is
// converted to MaxUint64 by GetCgroupParamUint. Historically, we
// represent "no limit" for pids as 0, thus this conversion.
if max == math.MaxUint64 {
max = 0
}
stats.PidsStats.Current = current
stats.PidsStats.Limit = max
return nil
}
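// examplePidsLimit is an illustrative, hypothetical helper (not part of runc
// itself) showing the conversion GetStats applies to pids.max: the literal
// string "max" is returned by GetCgroupParamUint as math.MaxUint64, which is
// then reported as 0, runc's historical value for "no limit".
func examplePidsLimit(raw uint64) uint64 {
	if raw == math.MaxUint64 {
		return 0
	}
	return raw
}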
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
type RdmaGroup struct{}
func (s *RdmaGroup) Name() string {
return "rdma"
}
func (s *RdmaGroup) Apply(path string, _ *cgroups.Resources, pid int) error {
return apply(path, pid)
}
func (s *RdmaGroup) Set(path string, r *cgroups.Resources) error {
return fscommon.RdmaSet(path, r)
}
func (s *RdmaGroup) GetStats(path string, stats *cgroups.Stats) error {
return fscommon.RdmaGetStats(path, stats)
}
package fs2
import (
"bufio"
"errors"
"os"
"strconv"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
func isCPUSet(r *cgroups.Resources) bool {
return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0 || r.CPUIdle != nil || r.CpuBurst != nil
}
func setCPU(dirPath string, r *cgroups.Resources) error {
if !isCPUSet(r) {
return nil
}
if r.CPUIdle != nil {
if err := cgroups.WriteFile(dirPath, "cpu.idle", strconv.FormatInt(*r.CPUIdle, 10)); err != nil {
return err
}
}
// NOTE: .CpuShares is not used here. Conversion is the caller's responsibility.
if r.CpuWeight != 0 {
if err := cgroups.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil {
return err
}
}
var burst string
if r.CpuBurst != nil {
burst = strconv.FormatUint(*r.CpuBurst, 10)
if err := cgroups.WriteFile(dirPath, "cpu.max.burst", burst); err != nil {
// Sometimes when the burst to be set is larger
// than the current one, it is rejected by the kernel
// (EINVAL) as old_quota/new_burst exceeds the parent
// cgroup quota limit. If this happens and the quota is
// going to be set, ignore the error for now and retry
// after setting the quota.
if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 {
return err
}
} else {
burst = ""
}
}
if r.CpuQuota != 0 || r.CpuPeriod != 0 {
str := "max"
if r.CpuQuota > 0 {
str = strconv.FormatInt(r.CpuQuota, 10)
}
period := r.CpuPeriod
if period == 0 {
// This default value is documented in
// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
period = 100000
}
str += " " + strconv.FormatUint(period, 10)
if err := cgroups.WriteFile(dirPath, "cpu.max", str); err != nil {
return err
}
if burst != "" {
if err := cgroups.WriteFile(dirPath, "cpu.max.burst", burst); err != nil {
return err
}
}
}
return nil
}
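// exampleCPUMax is an illustrative, hypothetical helper (not part of runc
// itself) showing the "quota period" string setCPU writes to cpu.max: a quota
// of 50000us out of a 100000us period becomes "50000 100000", and an
// unlimited quota becomes "max 100000".
func exampleCPUMax(quota int64, period uint64) string {
	str := "max"
	if quota > 0 {
		str = strconv.FormatInt(quota, 10)
	}
	if period == 0 {
		period = 100000 // kernel default period, in microseconds
	}
	return str + " " + strconv.FormatUint(period, 10)
}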
func statCpu(dirPath string, stats *cgroups.Stats) error {
const file = "cpu.stat"
f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY)
if err != nil {
return err
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
switch t {
case "usage_usec":
stats.CpuStats.CpuUsage.TotalUsage = v * 1000
case "user_usec":
stats.CpuStats.CpuUsage.UsageInUsermode = v * 1000
case "system_usec":
stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000
case "nr_periods":
stats.CpuStats.ThrottlingData.Periods = v
case "nr_throttled":
stats.CpuStats.ThrottlingData.ThrottledPeriods = v
case "throttled_usec":
stats.CpuStats.ThrottlingData.ThrottledTime = v * 1000
}
}
if err := sc.Err(); err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
return nil
}
package fs2
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
)
func isCpusetSet(r *cgroups.Resources) bool {
return r.CpusetCpus != "" || r.CpusetMems != ""
}
func setCpuset(dirPath string, r *cgroups.Resources) error {
if !isCpusetSet(r) {
return nil
}
if r.CpusetCpus != "" {
if err := cgroups.WriteFile(dirPath, "cpuset.cpus", r.CpusetCpus); err != nil {
return err
}
}
if r.CpusetMems != "" {
if err := cgroups.WriteFile(dirPath, "cpuset.mems", r.CpusetMems); err != nil {
return err
}
}
return nil
}
package fs2
import (
"fmt"
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
func supportedControllers() (string, error) {
return cgroups.ReadFile(UnifiedMountpoint, "/cgroup.controllers")
}
// needAnyControllers returns whether any of the supported controllers needs
// to be enabled, based on (1) the controllers available and (2) the resources
// that are being set. We don't check "pseudo" controllers such as
// "freezer" and "devices".
func needAnyControllers(r *cgroups.Resources) (bool, error) {
if r == nil {
return false, nil
}
// list of all available controllers
content, err := supportedControllers()
if err != nil {
return false, err
}
avail := make(map[string]struct{})
for _, ctr := range strings.Fields(content) {
avail[ctr] = struct{}{}
}
// check whether a controller is available or not
have := func(controller string) bool {
_, ok := avail[controller]
return ok
}
if isPidsSet(r) && have("pids") {
return true, nil
}
if isMemorySet(r) && have("memory") {
return true, nil
}
if isIoSet(r) && have("io") {
return true, nil
}
if isCPUSet(r) && have("cpu") {
return true, nil
}
if isCpusetSet(r) && have("cpuset") {
return true, nil
}
if isHugeTlbSet(r) && have("hugetlb") {
return true, nil
}
return false, nil
}
// containsDomainController returns whether the current config contains a domain controller or not.
// Refer to: http://man7.org/linux/man-pages/man7/cgroups.7.html
// As of Linux 4.19, the following controllers are threaded: cpu, perf_event, and pids.
func containsDomainController(r *cgroups.Resources) bool {
return isMemorySet(r) || isIoSet(r) || isCPUSet(r) || isHugeTlbSet(r)
}
// CreateCgroupPath creates a cgroupv2 path, enabling all the supported controllers.
func CreateCgroupPath(path string, c *cgroups.Cgroup) (Err error) {
if !strings.HasPrefix(path, UnifiedMountpoint) {
return fmt.Errorf("invalid cgroup path %s", path)
}
content, err := supportedControllers()
if err != nil {
return err
}
const (
cgTypeFile = "cgroup.type"
cgStCtlFile = "cgroup.subtree_control"
)
ctrs := strings.Fields(content)
res := "+" + strings.Join(ctrs, " +")
elements := strings.Split(path, "/")
elements = elements[3:]
current := "/sys/fs"
for i, e := range elements {
current = filepath.Join(current, e)
if i > 0 {
if err := os.Mkdir(current, 0o755); err != nil {
if !os.IsExist(err) {
return err
}
} else {
// If the directory was created, be sure it is not left around on errors.
current := current
defer func() {
if Err != nil {
os.Remove(current)
}
}()
}
cgType, _ := cgroups.ReadFile(current, cgTypeFile)
cgType = strings.TrimSpace(cgType)
switch cgType {
// If the cgroup is in an invalid mode (usually this means there's an internal
// process in the cgroup tree, because we created a cgroup under an
// already-populated-by-other-processes cgroup), then we have to error out if
// the user requested controllers which are not thread-aware. However, if all
// the controllers requested are thread-aware we can simply put the cgroup into
// threaded mode.
case "domain invalid":
if containsDomainController(c.Resources) {
return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in an invalid state", current)
} else {
// Not entirely correct (in theory we'd always want to be a domain --
// since that means we're a properly delegated cgroup subtree) but in
// this case there's not much we can do and it's better than giving an
// error.
_ = cgroups.WriteFile(current, cgTypeFile, "threaded")
}
// If the cgroup is in (threaded) or (domain threaded) mode, we can only use thread-aware controllers
// (and you cannot usually take a cgroup out of threaded mode).
case "domain threaded":
fallthrough
case "threaded":
if containsDomainController(c.Resources) {
return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, cgType)
}
}
}
// enable all supported controllers
if i < len(elements)-1 {
if err := cgroups.WriteFile(current, cgStCtlFile, res); err != nil {
// try writing them one by one
allCtrs := strings.Split(res, " ")
for _, ctr := range allCtrs {
_ = cgroups.WriteFile(current, cgStCtlFile, ctr)
}
}
// Some controllers might not be enabled when rootless or containerized,
// but we don't catch the error here. (Caught in setXXX() functions.)
}
}
return nil
}
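// Illustrative sketch (not part of the original code): cgroup.subtree_control
// takes space-separated controller names prefixed with '+' (enable) or '-'
// (disable), which is what the "+" joining above produces. Enabling cpu and
// memory for the children of a hypothetical /sys/fs/cgroup/foo would be:
//
//	_ = cgroups.WriteFile("/sys/fs/cgroup/foo", "cgroup.subtree_control", "+cpu +memory")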
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package fs2
import (
"bufio"
"errors"
"io"
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/internal/path"
)
const UnifiedMountpoint = "/sys/fs/cgroup"
func defaultDirPath(c *cgroups.Cgroup) (string, error) {
innerPath, err := path.Inner(c)
if err != nil {
return "", err
}
if filepath.IsAbs(innerPath) {
return filepath.Join(UnifiedMountpoint, innerPath), nil
}
// we don't need to use /proc/thread-self here because runc always runs
// with every thread in the same cgroup. This lets us avoid having to do
// runtime.LockOSThread.
ownCgroup, err := parseCgroupFile("/proc/self/cgroup")
if err != nil {
return "", err
}
// The current user scope most probably has tasks in it already,
// making it impossible to enable controllers for its sub-cgroup.
// A parent cgroup (with no tasks in it) is what we need.
ownCgroup = filepath.Dir(ownCgroup)
return filepath.Join(UnifiedMountpoint, ownCgroup, innerPath), nil
}
// parseCgroupFile parses a /proc/PID/cgroup file and returns the unified cgroup path.
func parseCgroupFile(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
return parseCgroupFromReader(f)
}
func parseCgroupFromReader(r io.Reader) (string, error) {
s := bufio.NewScanner(r)
for s.Scan() {
// "0::/user.slice/user-1001.slice/session-1.scope"
if path, ok := strings.CutPrefix(s.Text(), "0::"); ok {
return path, nil
}
}
if err := s.Err(); err != nil {
return "", err
}
return "", errors.New("cgroup path not found")
}
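// Illustrative sketch (not part of the original code): on a cgroup v2 host
// /proc/self/cgroup contains a single unified entry, and parseCgroupFromReader
// returns everything after the "0::" prefix:
//
//	p, err := parseCgroupFromReader(strings.NewReader("0::/user.slice/user-1001.slice/session-1.scope\n"))
//	// p == "/user.slice/user-1001.slice/session-1.scope", err == nil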
package fs2
import (
"bufio"
"errors"
"fmt"
"os"
"strings"
"time"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
func setFreezer(dirPath string, state cgroups.FreezerState) error {
var stateStr string
switch state {
case cgroups.Undefined:
return nil
case cgroups.Frozen:
stateStr = "1"
case cgroups.Thawed:
stateStr = "0"
default:
return fmt.Errorf("invalid freezer state %q requested", state)
}
fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDWR)
if err != nil {
// We can ignore this request as long as the user didn't ask us to
// freeze the container (since without the freezer cgroup, that's a
// no-op).
if state != cgroups.Frozen {
return nil
}
return fmt.Errorf("freezer not supported: %w", err)
}
defer fd.Close()
if _, err := fd.WriteString(stateStr); err != nil {
return err
}
// Confirm that the cgroup did actually change states.
if actualState, err := readFreezer(dirPath, fd); err != nil {
return err
} else if actualState != state {
return fmt.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState)
}
return nil
}
func getFreezer(dirPath string) (cgroups.FreezerState, error) {
fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDONLY)
if err != nil {
// If the kernel is too old, then we just treat the freezer as being in
// an "undefined" state.
if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) {
err = nil
}
return cgroups.Undefined, err
}
defer fd.Close()
return readFreezer(dirPath, fd)
}
func readFreezer(dirPath string, fd *os.File) (cgroups.FreezerState, error) {
if _, err := fd.Seek(0, 0); err != nil {
return cgroups.Undefined, err
}
state := make([]byte, 2)
if _, err := fd.Read(state); err != nil {
return cgroups.Undefined, err
}
switch string(state) {
case "0\n":
return cgroups.Thawed, nil
case "1\n":
return waitFrozen(dirPath)
default:
return cgroups.Undefined, fmt.Errorf(`unknown "cgroup.freeze" state: %q`, state)
}
}
// waitFrozen polls cgroup.events until it sees "frozen 1" in it.
func waitFrozen(dirPath string) (cgroups.FreezerState, error) {
fd, err := cgroups.OpenFile(dirPath, "cgroup.events", unix.O_RDONLY)
if err != nil {
return cgroups.Undefined, err
}
defer fd.Close()
// XXX: Simple wait/read/retry is used here. An implementation
// based on poll(2) or inotify(7) is possible, but it makes the code
// much more complicated. Maybe address this later.
const (
// Perform maxIter with waitTime in between iterations.
waitTime = 10 * time.Millisecond
maxIter = 1000
)
scanner := bufio.NewScanner(fd)
for i := 0; scanner.Scan(); {
if i == maxIter {
return cgroups.Undefined, fmt.Errorf("timeout of %s reached waiting for the cgroup to freeze", waitTime*maxIter)
}
if val, ok := strings.CutPrefix(scanner.Text(), "frozen "); ok {
if val[0] == '1' {
return cgroups.Frozen, nil
}
i++
// wait, then re-read
time.Sleep(waitTime)
_, err := fd.Seek(0, 0)
if err != nil {
return cgroups.Undefined, err
}
}
}
// Should only reach here either on read error,
// or if the file does not contain a "frozen " line.
return cgroups.Undefined, scanner.Err()
}
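// Illustrative sketch (not part of the original code): cgroup.events is a
// flat "key value" file, e.g.
//
//	populated 1
//	frozen 1
//
// waitFrozen polls the "frozen " line until it reports 1 (or the iteration
// limit is hit), while callers typically go through getFreezer:
//
//	state, err := getFreezer("/sys/fs/cgroup/foo") // path illustrative
//	// state is cgroups.Thawed, cgroups.Frozen, or cgroups.Undefined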
package fs2
import (
"errors"
"fmt"
"os"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
type parseError = fscommon.ParseError
type Manager struct {
config *cgroups.Cgroup
// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
dirPath string
// controllers is content of "cgroup.controllers" file.
// excludes pseudo-controllers ("devices" and "freezer").
controllers map[string]struct{}
}
// NewManager creates a manager for cgroup v2 unified hierarchy.
// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope".
// If dirPath is empty, it is automatically set using config.
func NewManager(config *cgroups.Cgroup, dirPath string) (*Manager, error) {
if dirPath == "" {
var err error
dirPath, err = defaultDirPath(config)
if err != nil {
return nil, err
}
}
m := &Manager{
config: config,
dirPath: dirPath,
}
return m, nil
}
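// Illustrative sketch (not part of the original code): typical use of the
// fs2 manager from outside this package; the cgroup path, pid, and limit
// below are hypothetical.
//
//	cg := &cgroups.Cgroup{Path: "/mycontainer", Resources: &cgroups.Resources{PidsLimit: 100}}
//	m, err := fs2.NewManager(cg, "") // dirPath is derived from the config
//	if err == nil {
//		_ = m.Apply(12345)      // create the cgroup and add the pid to it
//		_ = m.Set(cg.Resources) // apply the resource limits
//	}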
func (m *Manager) getControllers() error {
if m.controllers != nil {
return nil
}
data, err := cgroups.ReadFile(m.dirPath, "cgroup.controllers")
if err != nil {
if m.config.Rootless && m.config.Path == "" {
return nil
}
return err
}
fields := strings.Fields(data)
m.controllers = make(map[string]struct{}, len(fields))
for _, c := range fields {
m.controllers[c] = struct{}{}
}
return nil
}
func (m *Manager) Apply(pid int) error {
if err := CreateCgroupPath(m.dirPath, m.config); err != nil {
// Related tests:
// - "runc create (no limits + no cgrouppath + no permission) succeeds"
// - "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error"
// - "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if m.config.Rootless {
if m.config.Path == "" {
if blNeed, nErr := needAnyControllers(m.config.Resources); nErr == nil && !blNeed {
return cgroups.ErrRootless
}
return fmt.Errorf("rootless needs no limits + no cgrouppath when no permission is granted for cgroups: %w", err)
}
}
return err
}
if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil {
return err
}
return nil
}
func (m *Manager) GetPids() ([]int, error) {
return cgroups.GetPids(m.dirPath)
}
func (m *Manager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.dirPath)
}
func (m *Manager) GetStats() (*cgroups.Stats, error) {
var errs []error
st := cgroups.NewStats()
// pids (since kernel 4.5)
if err := statPids(m.dirPath, st); err != nil {
errs = append(errs, err)
}
// memory (since kernel 4.5)
if err := statMemory(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// io (since kernel 4.5)
if err := statIo(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// cpu (since kernel 4.15)
// Note cpu.stat is available even if the controller is not enabled.
if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// PSI (since kernel 4.20).
var err error
if st.CpuStats.PSI, err = statPSI(m.dirPath, "cpu.pressure"); err != nil {
errs = append(errs, err)
}
if st.MemoryStats.PSI, err = statPSI(m.dirPath, "memory.pressure"); err != nil {
errs = append(errs, err)
}
if st.BlkioStats.PSI, err = statPSI(m.dirPath, "io.pressure"); err != nil {
errs = append(errs, err)
}
// hugetlb (since kernel 5.6)
if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// rdma (since kernel 4.11)
if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// misc (since kernel 5.13)
if err := statMisc(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
if len(errs) > 0 && !m.config.Rootless {
return st, fmt.Errorf("error while statting cgroup v2: %+v", errs)
}
return st, nil
}
func (m *Manager) Freeze(state cgroups.FreezerState) error {
if m.config.Resources == nil {
return errors.New("cannot toggle freezer: cgroups not configured for container")
}
if err := setFreezer(m.dirPath, state); err != nil {
return err
}
m.config.Resources.Freezer = state
return nil
}
func (m *Manager) Destroy() error {
return cgroups.RemovePath(m.dirPath)
}
func (m *Manager) Path(_ string) string {
return m.dirPath
}
func (m *Manager) Set(r *cgroups.Resources) error {
if r == nil {
return nil
}
if err := m.getControllers(); err != nil {
return err
}
// pids (since kernel 4.5)
if err := setPids(m.dirPath, r); err != nil {
return err
}
// memory (since kernel 4.5)
if err := setMemory(m.dirPath, r); err != nil {
return err
}
// io (since kernel 4.5)
if err := setIo(m.dirPath, r); err != nil {
return err
}
// cpu (since kernel 4.15)
if err := setCPU(m.dirPath, r); err != nil {
return err
}
// devices (since kernel 4.15, pseudo-controller)
//
// When rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if err := setDevices(m.dirPath, r); err != nil {
if !m.config.Rootless || errors.Is(err, cgroups.ErrDevicesUnsupported) {
return err
}
}
// cpuset (since kernel 5.0)
if err := setCpuset(m.dirPath, r); err != nil {
return err
}
// hugetlb (since kernel 5.6)
if err := setHugeTlb(m.dirPath, r); err != nil {
return err
}
// rdma (since kernel 4.11)
if err := fscommon.RdmaSet(m.dirPath, r); err != nil {
return err
}
// freezer (since kernel 5.2, pseudo-controller)
if err := setFreezer(m.dirPath, r.Freezer); err != nil {
return err
}
if err := m.setUnified(r.Unified); err != nil {
return err
}
m.config.Resources = r
return nil
}
func setDevices(dirPath string, r *cgroups.Resources) error {
if cgroups.DevicesSetV2 == nil {
if len(r.Devices) > 0 {
return cgroups.ErrDevicesUnsupported
}
return nil
}
return cgroups.DevicesSetV2(dirPath, r)
}
func (m *Manager) setUnified(res map[string]string) error {
for k, v := range res {
if strings.Contains(k, "/") {
return fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
}
if err := cgroups.WriteFileByLine(m.dirPath, k, v); err != nil {
// Check for both EPERM and ENOENT since O_CREAT is used by WriteFile.
if errors.Is(err, os.ErrPermission) || errors.Is(err, os.ErrNotExist) {
// Check if a controller is available,
// to give a more specific error if not.
c, _, ok := strings.Cut(k, ".")
if !ok {
return fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k)
}
if _, ok := m.controllers[c]; !ok && c != "cgroup" {
return fmt.Errorf("unified resource %q can't be set: controller %q not available", k, c)
}
}
return fmt.Errorf("unable to set unified resource %q: %w", k, err)
}
}
return nil
}
func (m *Manager) GetPaths() map[string]string {
paths := make(map[string]string, 1)
paths[""] = m.dirPath
return paths
}
func (m *Manager) GetCgroups() (*cgroups.Cgroup, error) {
return m.config, nil
}
func (m *Manager) GetFreezerState() (cgroups.FreezerState, error) {
return getFreezer(m.dirPath)
}
func (m *Manager) Exists() bool {
return cgroups.PathExists(m.dirPath)
}
func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.events", "oom_kill")
}
func (m *Manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.dirPath)
if err != nil && m.config.Rootless && os.IsNotExist(err) {
err = nil
}
return c, err
}
func CheckMemoryUsage(dirPath string, r *cgroups.Resources) error {
if !r.MemoryCheckBeforeUpdate {
return nil
}
if r.Memory <= 0 && r.MemorySwap <= 0 {
return nil
}
usage, err := fscommon.GetCgroupParamUint(dirPath, "memory.current")
if err != nil {
// This check is done on a best-effort basis, so if we can't read the
// current usage (cgroup not yet created, or any other error),
// we should not fail.
return nil
}
if r.MemorySwap > 0 {
if uint64(r.MemorySwap) <= usage {
return fmt.Errorf("rejecting memory+swap limit %d <= usage %d", r.MemorySwap, usage)
}
}
if r.Memory > 0 {
if uint64(r.Memory) <= usage {
return fmt.Errorf("rejecting memory limit %d <= usage %d", r.Memory, usage)
}
}
return nil
}
//go:build gofuzz
// +build gofuzz
// Copyright 2021 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package fs2
import (
"bytes"
"errors"
gofuzzheaders "github.com/AdaLogics/go-fuzz-headers"
"github.com/opencontainers/runc/libcontainer/cgroups"
"os"
)
func FuzzCgroupReader(data []byte) int {
r := bytes.NewReader(data)
_, _ = parseCgroupFromReader(r)
return 1
}
func createFiles(files []string, cf *gofuzzheaders.ConsumeFuzzer) error {
for i := 0; i < len(files); i++ {
f, err := os.OpenFile(files[i], os.O_RDWR|os.O_CREATE, 0755)
if err != nil {
return errors.New("Could not create file")
}
defer f.Close()
//defer os.RemoveAll(files[i])
b, err := cf.GetBytes()
if err != nil {
return errors.New("Could not get bytes")
}
_, err = f.Write(b)
if err != nil {
return errors.New("Could not write to file")
}
}
return nil
}
func FuzzGetStats(data []byte) int {
stats := cgroups.Stats{}
f := gofuzzheaders.NewConsumer(data)
err := f.GenerateStruct(&stats)
if err != nil {
return -1
}
// statPids:
sPidsFiles := []string{"/tmp/pids.current",
"/tmp/pids.max"}
err = createFiles(sPidsFiles, f)
if err != nil {
return -1
}
defer os.RemoveAll("/tmp/pids.current")
defer os.RemoveAll("/tmp/pids.max")
_ = statPids("/tmp", &stats)
// statMemory:
stats3 := cgroups.Stats{}
err = f.GenerateStruct(&stats3)
if err != nil {
return -1
}
sMemFiles := []string{"/tmp/memory.stat",
"/tmp/memory.swap",
"/tmp/memory.current",
"/tmp/memory.max"}
err = createFiles(sMemFiles, f)
if err != nil {
return -1
}
defer os.RemoveAll("/tmp/memory.stat")
defer os.RemoveAll("/tmp/memory.swap")
defer os.RemoveAll("/tmp/memory.current")
defer os.RemoveAll("/tmp/memory.max")
_ = statMemory("/tmp", &stats3)
// StatIo:
stats4 := cgroups.Stats{}
err = f.GenerateStruct(&stats4)
if err != nil {
return -1
}
sIoFiles := []string{"/tmp/io.stat"}
err = createFiles(sIoFiles, f)
if err != nil {
return -1
}
defer os.RemoveAll("/tmp/io.stat")
_ = statIo("/tmp", &stats4)
// statCpu:
stats5 := cgroups.Stats{}
err = f.GenerateStruct(&stats5)
if err != nil {
return -1
}
sCpuFiles := []string{"/tmp/cpu.stat"}
err = createFiles(sCpuFiles, f)
if err != nil {
return -1
}
defer os.RemoveAll("/tmp/cpu.stat")
_ = statCpu("/tmp", &stats5)
return 1
}
package fs2
import (
"errors"
"os"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
func isHugeTlbSet(r *cgroups.Resources) bool {
return len(r.HugetlbLimit) > 0
}
func setHugeTlb(dirPath string, r *cgroups.Resources) error {
if !isHugeTlbSet(r) {
return nil
}
const suffix = ".max"
skipRsvd := false
for _, hugetlb := range r.HugetlbLimit {
prefix := "hugetlb." + hugetlb.Pagesize
val := strconv.FormatUint(hugetlb.Limit, 10)
if err := cgroups.WriteFile(dirPath, prefix+suffix, val); err != nil {
return err
}
if skipRsvd {
continue
}
if err := cgroups.WriteFile(dirPath, prefix+".rsvd"+suffix, val); err != nil {
if errors.Is(err, os.ErrNotExist) {
skipRsvd = true
continue
}
return err
}
}
return nil
}
func statHugeTlb(dirPath string, stats *cgroups.Stats) error {
hugetlbStats := cgroups.HugetlbStats{}
rsvd := ".rsvd"
for _, pagesize := range cgroups.HugePageSizes() {
again:
prefix := "hugetlb." + pagesize + rsvd
value, err := fscommon.GetCgroupParamUint(dirPath, prefix+".current")
if err != nil {
if rsvd != "" && errors.Is(err, os.ErrNotExist) {
rsvd = ""
goto again
}
return err
}
hugetlbStats.Usage = value
value, err = fscommon.GetValueByKey(dirPath, prefix+".events", "max")
if err != nil {
return err
}
hugetlbStats.Failcnt = value
stats.HugetlbStats[pagesize] = hugetlbStats
}
return nil
}
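// Illustrative sketch (not part of the original code): for a 2MB page size
// the files used above are named like
//
//	hugetlb.2MB.max           // limit written by setHugeTlb
//	hugetlb.2MB.rsvd.max      // reservation-accounting variant, if present
//	hugetlb.2MB.rsvd.current  // falls back to hugetlb.2MB.current
//	hugetlb.2MB.rsvd.events   // the "max <n>" line is the fail count
//
// The ".rsvd" names are preferred, with a fallback to the plain names on
// kernels that do not provide them.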
package fs2
import (
"bufio"
"bytes"
"fmt"
"os"
"strconv"
"strings"
"github.com/sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
func isIoSet(r *cgroups.Resources) bool {
return r.BlkioWeight != 0 ||
len(r.BlkioWeightDevice) > 0 ||
len(r.BlkioThrottleReadBpsDevice) > 0 ||
len(r.BlkioThrottleWriteBpsDevice) > 0 ||
len(r.BlkioThrottleReadIOPSDevice) > 0 ||
len(r.BlkioThrottleWriteIOPSDevice) > 0
}
// bfqDeviceWeightSupported checks for per-device BFQ weight support (added
// in kernel v5.4, commit 795fe54c2a8) by reading from "io.bfq.weight".
func bfqDeviceWeightSupported(bfq *os.File) bool {
if bfq == nil {
return false
}
_, _ = bfq.Seek(0, 0)
buf := make([]byte, 32)
_, _ = bfq.Read(buf)
// If only a single number (the default weight) is read back, we have an older kernel.
_, err := strconv.ParseInt(string(bytes.TrimSpace(buf)), 10, 64)
return err != nil
}
func setIo(dirPath string, r *cgroups.Resources) error {
if !isIoSet(r) {
return nil
}
// If BFQ IO scheduler is available, use it.
var bfq *os.File
if r.BlkioWeight != 0 || len(r.BlkioWeightDevice) > 0 {
var err error
bfq, err = cgroups.OpenFile(dirPath, "io.bfq.weight", os.O_RDWR)
if err == nil {
defer bfq.Close()
} else if !os.IsNotExist(err) {
return err
}
}
if r.BlkioWeight != 0 {
if bfq != nil { // Use BFQ.
if _, err := bfq.WriteString(strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
return err
}
} else {
// Fallback to io.weight with a conversion scheme.
v := cgroups.ConvertBlkIOToIOWeightValue(r.BlkioWeight)
if err := cgroups.WriteFile(dirPath, "io.weight", strconv.FormatUint(v, 10)); err != nil {
return err
}
}
}
if bfqDeviceWeightSupported(bfq) {
for _, wd := range r.BlkioWeightDevice {
if _, err := bfq.WriteString(wd.WeightString() + "\n"); err != nil {
return fmt.Errorf("setting device weight %q: %w", wd.WeightString(), err)
}
}
}
for _, td := range r.BlkioThrottleReadBpsDevice {
if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteBpsDevice {
if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleReadIOPSDevice {
if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteIOPSDevice {
if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil {
return err
}
}
return nil
}
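// Illustrative sketch (not part of the original code): each io.max write
// above is a single "MAJOR:MINOR key=value" line. Limiting reads on a
// hypothetical device 8:16 to 2 MiB/s while leaving writes unlimited:
//
//	_ = cgroups.WriteFile(dirPath, "io.max", "8:16 rbps=2097152 wbps=max")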
func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) {
ret := map[string][]string{}
f, err := cgroups.OpenFile(dirPath, name, os.O_RDONLY)
if err != nil {
return nil, err
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
line := scanner.Text()
parts := strings.Fields(line)
if len(parts) < 2 {
continue
}
ret[parts[0]] = parts[1:]
}
if err := scanner.Err(); err != nil {
return nil, &parseError{Path: dirPath, File: name, Err: err}
}
return ret, nil
}
func statIo(dirPath string, stats *cgroups.Stats) error {
const file = "io.stat"
values, err := readCgroup2MapFile(dirPath, file)
if err != nil {
return err
}
// more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt
var parsedStats cgroups.BlkioStats
for k, v := range values {
d := strings.Split(k, ":")
if len(d) != 2 {
continue
}
major, err := strconv.ParseUint(d[0], 10, 64)
if err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
minor, err := strconv.ParseUint(d[1], 10, 64)
if err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
for _, item := range v {
d := strings.Split(item, "=")
if len(d) != 2 {
continue
}
op := d[0]
// Map to the cgroupv1 naming and layout (in separate tables).
var targetTable *[]cgroups.BlkioStatEntry
switch op {
// Equivalent to cgroupv1's blkio.io_service_bytes.
case "rbytes":
op = "Read"
targetTable = &parsedStats.IoServiceBytesRecursive
case "wbytes":
op = "Write"
targetTable = &parsedStats.IoServiceBytesRecursive
// Equivalent to cgroupv1's blkio.io_serviced.
case "rios":
op = "Read"
targetTable = &parsedStats.IoServicedRecursive
case "wios":
op = "Write"
targetTable = &parsedStats.IoServicedRecursive
default:
// Skip over entries we cannot map to cgroupv1 stats for now.
// In the future we should expand the stats struct to include
// them.
logrus.Debugf("cgroupv2 io stats: skipping over unmappable %s entry", item)
continue
}
value, err := strconv.ParseUint(d[1], 10, 64)
if err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
entry := cgroups.BlkioStatEntry{
Op: op,
Major: major,
Minor: minor,
Value: value,
}
*targetTable = append(*targetTable, entry)
}
}
stats.BlkioStats = parsedStats
return nil
}
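// Illustrative sketch (not part of the original code): an io.stat line looks
// like (values illustrative)
//
//	8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0
//
// statIo maps rbytes/wbytes into IoServiceBytesRecursive and rios/wios into
// IoServicedRecursive, mirroring the cgroup v1 blkio tables; other keys are
// skipped.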
package fs2
import (
"bufio"
"errors"
"math"
"os"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
// numToStr converts an int64 value to a string for writing to a
// cgroupv2 file with a .min, .max, .low, or .high suffix.
// The value of -1 is converted to "max" for cgroupv1 compatibility
// (which used to write -1 to remove the limit).
func numToStr(value int64) (ret string) {
switch {
case value == 0:
ret = ""
case value == -1:
ret = "max"
default:
ret = strconv.FormatInt(value, 10)
}
return ret
}
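// Illustrative sketch (not part of the original code):
//
//	numToStr(-1)    == "max" // no limit
//	numToStr(0)     == ""    // leave the file untouched
//	numToStr(65536) == "65536"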
func isMemorySet(r *cgroups.Resources) bool {
return r.MemoryReservation != 0 || r.Memory != 0 || r.MemorySwap != 0
}
func setMemory(dirPath string, r *cgroups.Resources) error {
if !isMemorySet(r) {
return nil
}
if err := CheckMemoryUsage(dirPath, r); err != nil {
return err
}
swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
if err != nil {
return err
}
swapStr := numToStr(swap)
if swapStr == "" && swap == 0 && r.MemorySwap > 0 {
// memory and memorySwap set to the same value -- disable swap
swapStr = "0"
}
// Never write an empty string to `memory.swap.max`, as that means setting it to 0.
if swapStr != "" {
if err := cgroups.WriteFile(dirPath, "memory.swap.max", swapStr); err != nil {
// If swap is not enabled, silently ignore setting to max or disabling it.
if !(errors.Is(err, os.ErrNotExist) && (swapStr == "max" || swapStr == "0")) {
return err
}
}
}
if val := numToStr(r.Memory); val != "" {
if err := cgroups.WriteFile(dirPath, "memory.max", val); err != nil {
return err
}
}
// cgroup.Resources.KernelMemory is ignored
if val := numToStr(r.MemoryReservation); val != "" {
if err := cgroups.WriteFile(dirPath, "memory.low", val); err != nil {
return err
}
}
return nil
}
func statMemory(dirPath string, stats *cgroups.Stats) error {
const file = "memory.stat"
statsFile, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY)
if err != nil {
return err
}
defer statsFile.Close()
sc := bufio.NewScanner(statsFile)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
stats.MemoryStats.Stats[t] = v
}
if err := sc.Err(); err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
stats.MemoryStats.Cache = stats.MemoryStats.Stats["file"]
// Unlike cgroup v1 which has memory.use_hierarchy binary knob,
// cgroup v2 is always hierarchical.
stats.MemoryStats.UseHierarchy = true
memoryUsage, err := getMemoryDataV2(dirPath, "")
if err != nil {
if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint {
// The root cgroup does not have memory.{current,max,peak}
// so emulate those using data from /proc/meminfo and
// /sys/fs/cgroup/memory.stat
return rootStatsFromMeminfo(stats)
}
return err
}
stats.MemoryStats.Usage = memoryUsage
swapOnlyUsage, err := getMemoryDataV2(dirPath, "swap")
if err != nil {
return err
}
stats.MemoryStats.SwapOnlyUsage = swapOnlyUsage
swapUsage := swapOnlyUsage
// As cgroup v1 reports SwapUsage values as mem+swap combined,
// while in cgroup v2 swap values do not include memory,
// report combined mem+swap for v1 compatibility.
swapUsage.Usage += memoryUsage.Usage
if swapUsage.Limit != math.MaxUint64 {
swapUsage.Limit += memoryUsage.Limit
}
// The `MaxUsage` of mem+swap cannot simply combine mem with
// swap. So set it to 0 for v1 compatibility.
swapUsage.MaxUsage = 0
stats.MemoryStats.SwapUsage = swapUsage
return nil
}
func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
memoryData := cgroups.MemoryData{}
moduleName := "memory"
if name != "" {
moduleName = "memory." + name
}
usage := moduleName + ".current"
limit := moduleName + ".max"
maxUsage := moduleName + ".peak"
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
if name != "" && os.IsNotExist(err) {
// Ignore ENOENT as there's no swap accounting
// if kernel CONFIG_MEMCG_SWAP is not set or
// swapaccount=0 kernel boot parameter is given.
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, err
}
memoryData.Usage = value
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
return cgroups.MemoryData{}, err
}
memoryData.Limit = value
// `memory.peak` since kernel 5.19
// `memory.swap.peak` since kernel 6.5
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil && !os.IsNotExist(err) {
return cgroups.MemoryData{}, err
}
memoryData.MaxUsage = value
return memoryData, nil
}
func rootStatsFromMeminfo(stats *cgroups.Stats) error {
const file = "/proc/meminfo"
f, err := os.Open(file)
if err != nil {
return err
}
defer f.Close()
// Fields we are interested in.
var (
swap_free uint64
swap_total uint64
)
mem := map[string]*uint64{
"SwapFree": &swap_free,
"SwapTotal": &swap_total,
}
found := 0
sc := bufio.NewScanner(f)
for sc.Scan() {
parts := strings.SplitN(sc.Text(), ":", 3)
if len(parts) != 2 {
// Should not happen.
continue
}
k := parts[0]
p, ok := mem[k]
if !ok {
// Unknown field -- not interested.
continue
}
vStr := strings.TrimSpace(strings.TrimSuffix(parts[1], " kB"))
*p, err = strconv.ParseUint(vStr, 10, 64)
if err != nil {
return &parseError{File: file, Err: errors.New("bad value for " + k)}
}
found++
if found == len(mem) {
// Got everything we need -- skip the rest.
break
}
}
if err := sc.Err(); err != nil {
return &parseError{Path: "", File: file, Err: err}
}
// cgroup v1 `usage_in_bytes` reports memory usage as the sum of
// - rss (NR_ANON_MAPPED)
// - cache (NR_FILE_PAGES)
// cgroup v1 reports SwapUsage values as mem+swap combined
// cgroup v2 reports rss and cache as anon and file.
// sum `anon` + `file` to report the same value as `usage_in_bytes` in v1.
// sum swap usage as combined mem+swap usage for consistency as well.
stats.MemoryStats.Usage.Usage = stats.MemoryStats.Stats["anon"] + stats.MemoryStats.Stats["file"]
stats.MemoryStats.Usage.Limit = math.MaxUint64
stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024
stats.MemoryStats.SwapUsage.Limit = math.MaxUint64
stats.MemoryStats.SwapUsage.Usage += stats.MemoryStats.Usage.Usage
return nil
}
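// Illustrative sketch (not part of the original code): the /proc/meminfo
// fields parsed above look like (values illustrative)
//
//	SwapTotal:       8388604 kB
//	SwapFree:        8388604 kB
//
// so root swap usage is (SwapTotal - SwapFree) * 1024 bytes, to which memory
// usage is added for cgroup v1 compatibility.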
package fs2
import (
"bufio"
"os"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
func statMisc(dirPath string, stats *cgroups.Stats) error {
for _, file := range []string{"current", "events"} {
fd, err := cgroups.OpenFile(dirPath, "misc."+file, os.O_RDONLY)
if err != nil {
return err
}
s := bufio.NewScanner(fd)
for s.Scan() {
key, value, err := fscommon.ParseKeyValue(s.Text())
if err != nil {
fd.Close()
return err
}
key = strings.TrimSuffix(key, ".max")
if _, ok := stats.MiscStats[key]; !ok {
stats.MiscStats[key] = cgroups.MiscStats{}
}
tmp := stats.MiscStats[key]
switch file {
case "current":
tmp.Usage = value
case "events":
tmp.Events = value
}
stats.MiscStats[key] = tmp
}
fd.Close()
if err := s.Err(); err != nil {
return err
}
}
return nil
}
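// Illustrative sketch (not part of the original code): misc.current and
// misc.events are flat "key value" files, e.g. (resource names illustrative)
//
//	// misc.current
//	res_a 10
//	// misc.events
//	res_a.max 2
//
// statMisc strips the ".max" suffix from event keys so that the usage and
// events of the same resource end up in a single MiscStats entry.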
package fs2
import (
"errors"
"math"
"os"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
func isPidsSet(r *cgroups.Resources) bool {
return r.PidsLimit != 0
}
func setPids(dirPath string, r *cgroups.Resources) error {
if !isPidsSet(r) {
return nil
}
if val := numToStr(r.PidsLimit); val != "" {
if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil {
return err
}
}
return nil
}
func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error {
// If the controller is not enabled, read the PIDs from cgroup.procs
// (falling back to cgroup.threads for threaded cgroups).
contents, err := cgroups.ReadFile(dirPath, "cgroup.procs")
if errors.Is(err, unix.ENOTSUP) {
contents, err = cgroups.ReadFile(dirPath, "cgroup.threads")
}
if err != nil {
return err
}
pids := strings.Count(contents, "\n")
stats.PidsStats.Current = uint64(pids)
stats.PidsStats.Limit = 0
return nil
}
func statPids(dirPath string, stats *cgroups.Stats) error {
current, err := fscommon.GetCgroupParamUint(dirPath, "pids.current")
if err != nil {
if os.IsNotExist(err) {
return statPidsFromCgroupProcs(dirPath, stats)
}
return err
}
max, err := fscommon.GetCgroupParamUint(dirPath, "pids.max")
if err != nil {
return err
}
// If no limit is set, read from pids.max returns "max", which is
// converted to MaxUint64 by GetCgroupParamUint. Historically, we
// represent "no limit" for pids as 0, thus this conversion.
if max == math.MaxUint64 {
max = 0
}
stats.PidsStats.Current = current
stats.PidsStats.Limit = max
return nil
}
package fs2
import (
"bufio"
"errors"
"fmt"
"os"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
func statPSI(dirPath string, file string) (*cgroups.PSIStats, error) {
f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY)
if err != nil {
if errors.Is(err, os.ErrNotExist) {
// Kernel < 4.20, or CONFIG_PSI is not set,
// or PSI stats are turned off for the cgroup
// ("echo 0 > cgroup.pressure", kernel >= 6.1).
return nil, nil
}
return nil, err
}
defer f.Close()
var psistats cgroups.PSIStats
sc := bufio.NewScanner(f)
for sc.Scan() {
parts := strings.Fields(sc.Text())
var pv *cgroups.PSIData
switch parts[0] {
case "some":
pv = &psistats.Some
case "full":
pv = &psistats.Full
}
if pv != nil {
*pv, err = parsePSIData(parts[1:])
if err != nil {
return nil, &parseError{Path: dirPath, File: file, Err: err}
}
}
}
if err := sc.Err(); err != nil {
if errors.Is(err, unix.ENOTSUP) {
// Some kernels (e.g. CS9) may return ENOTSUP on read
// if psi=1 kernel cmdline parameter is required.
return nil, nil
}
return nil, &parseError{Path: dirPath, File: file, Err: err}
}
return &psistats, nil
}
func parsePSIData(psi []string) (cgroups.PSIData, error) {
data := cgroups.PSIData{}
for _, f := range psi {
key, val, ok := strings.Cut(f, "=")
if !ok {
return data, fmt.Errorf("invalid psi data: %q", f)
}
var pv *float64
switch key {
case "avg10":
pv = &data.Avg10
case "avg60":
pv = &data.Avg60
case "avg300":
pv = &data.Avg300
case "total":
v, err := strconv.ParseUint(val, 10, 64)
if err != nil {
return data, fmt.Errorf("invalid %s PSI value: %w", key, err)
}
data.Total = v
}
if pv != nil {
v, err := strconv.ParseFloat(val, 64)
if err != nil {
return data, fmt.Errorf("invalid %s PSI value: %w", key, err)
}
*pv = v
}
}
return data, nil
}
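// Illustrative sketch (not part of the original code): a pressure file such
// as cpu.pressure contains lines like (values illustrative)
//
//	some avg10=0.00 avg60=0.12 avg300=0.05 total=123456
//	full avg10=0.00 avg60=0.00 avg300=0.00 total=0
//
// statPSI fills PSIStats.Some and PSIStats.Full from these lines, and
// parsePSIData converts the avg* fields to float64 and total to uint64.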
//go:build gofuzz
// +build gofuzz
// Copyright 2021 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package fscommon
import (
gofuzzheaders "github.com/AdaLogics/go-fuzz-headers"
securejoin "github.com/cyphar/filepath-securejoin"
)
func FuzzSecurejoin(data []byte) int {
c := gofuzzheaders.NewConsumer(data)
dir, err := c.GetString()
if err != nil {
return 0
}
file, err := c.GetString()
if err != nil {
return 0
}
_, err = securejoin.SecureJoin(dir, file)
if err != nil {
return 0
}
return 1
}
package fscommon
import (
"bufio"
"errors"
"math"
"os"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
// parseRdmaKV parses a raw "key=value" string into the given RdmaEntry.
func parseRdmaKV(raw string, entry *cgroups.RdmaEntry) error {
var value uint32
k, v, ok := strings.Cut(raw, "=")
if !ok {
return errors.New("Unable to parse RDMA entry")
}
if v == "max" {
value = math.MaxUint32
} else {
val64, err := strconv.ParseUint(v, 10, 32)
if err != nil {
return err
}
value = uint32(val64)
}
switch k {
case "hca_handle":
entry.HcaHandles = value
case "hca_object":
entry.HcaObjects = value
}
return nil
}
// readRdmaEntries reads the given file and converts its raw lines into RdmaEntry values.
// Example entry: mlx4_0 hca_handle=2 hca_object=2000
func readRdmaEntries(dir, file string) ([]cgroups.RdmaEntry, error) {
rdmaEntries := make([]cgroups.RdmaEntry, 0)
fd, err := cgroups.OpenFile(dir, file, unix.O_RDONLY)
if err != nil {
return nil, err
}
defer fd.Close() //nolint:errcheck
scanner := bufio.NewScanner(fd)
for scanner.Scan() {
parts := strings.SplitN(scanner.Text(), " ", 4)
if len(parts) == 3 {
entry := new(cgroups.RdmaEntry)
entry.Device = parts[0]
err = parseRdmaKV(parts[1], entry)
if err != nil {
continue
}
err = parseRdmaKV(parts[2], entry)
if err != nil {
continue
}
rdmaEntries = append(rdmaEntries, *entry)
}
}
return rdmaEntries, scanner.Err()
}
// RdmaGetStats returns rdma stats such as totalLimit and current entries.
func RdmaGetStats(path string, stats *cgroups.Stats) error {
currentEntries, err := readRdmaEntries(path, "rdma.current")
if err != nil {
if errors.Is(err, os.ErrNotExist) {
err = nil
}
return err
}
maxEntries, err := readRdmaEntries(path, "rdma.max")
if err != nil {
return err
}
// If a device was removed between reading the two files, skip reporting stats.
if len(currentEntries) != len(maxEntries) {
return nil
}
stats.RdmaStats = cgroups.RdmaStats{
RdmaLimit: maxEntries,
RdmaCurrent: currentEntries,
}
return nil
}
func createCmdString(device string, limits cgroups.LinuxRdma) string {
cmdString := device
if limits.HcaHandles != nil {
cmdString += " hca_handle=" + strconv.FormatUint(uint64(*limits.HcaHandles), 10)
}
if limits.HcaObjects != nil {
cmdString += " hca_object=" + strconv.FormatUint(uint64(*limits.HcaObjects), 10)
}
return cmdString
}
// RdmaSet sets RDMA resources.
func RdmaSet(path string, r *cgroups.Resources) error {
for device, limits := range r.Rdma {
if err := cgroups.WriteFile(path, "rdma.max", createCmdString(device, limits)); err != nil {
return err
}
}
return nil
}
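// Illustrative sketch (not part of the original code): RdmaSet writes one
// line per device to rdma.max; with a hypothetical device and limits this
// amounts to:
//
//	_ = cgroups.WriteFile(path, "rdma.max", "mlx4_0 hca_handle=2 hca_object=2000")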
package fscommon
import (
"errors"
"fmt"
"math"
"path"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
var (
// Deprecated: use cgroups.OpenFile instead.
OpenFile = cgroups.OpenFile
// Deprecated: use cgroups.ReadFile instead.
ReadFile = cgroups.ReadFile
// Deprecated: use cgroups.WriteFile instead.
WriteFile = cgroups.WriteFile
)
// ParseError records a parse error details, including the file path.
type ParseError struct {
Path string
File string
Err error
}
func (e *ParseError) Error() string {
return "unable to parse " + path.Join(e.Path, e.File) + ": " + e.Err.Error()
}
func (e *ParseError) Unwrap() error { return e.Err }
// ParseUint converts a string to a uint64 integer.
// Negative values are returned as zero because, due to kernel bugs,
// some of the memory cgroup stats can be negative.
func ParseUint(s string, base, bitSize int) (uint64, error) {
value, err := strconv.ParseUint(s, base, bitSize)
if err != nil {
intValue, intErr := strconv.ParseInt(s, base, bitSize)
// 1. Handle negative values that still fit in an int64 (greater than MinInt64), and
// 2. handle negative values below MinInt64 (where ParseInt fails with ErrRange).
if intErr == nil && intValue < 0 {
return 0, nil
} else if errors.Is(intErr, strconv.ErrRange) && intValue < 0 {
return 0, nil
}
return value, err
}
return value, nil
}
// ParseKeyValue parses a space-separated "key value" kind of cgroup
// parameter and returns its key as a string, and its value as uint64
// (using [ParseUint] to convert the value). For example,
// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234.
func ParseKeyValue(t string) (string, uint64, error) {
key, val, ok := strings.Cut(t, " ")
if !ok || key == "" || val == "" {
return "", 0, fmt.Errorf(`line %q is not in "key value" format`, t)
}
value, err := ParseUint(val, 10, 64)
if err != nil {
return "", 0, err
}
return key, value, nil
}
// GetValueByKey reads space-separated "key value" pairs from the specified
// cgroup file, looking for a specified key, and returns its value as uint64,
// using [ParseUint] for conversion. If the value is not found, 0 is returned.
func GetValueByKey(path, file, key string) (uint64, error) {
content, err := cgroups.ReadFile(path, file)
if err != nil {
return 0, err
}
key += " "
lines := strings.Split(content, "\n")
for _, line := range lines {
v, ok := strings.CutPrefix(line, key)
if ok {
val, err := ParseUint(v, 10, 64)
if err != nil {
err = &ParseError{Path: path, File: file, Err: err}
}
return val, err
}
}
return 0, nil
}
// GetCgroupParamUint reads a single uint64 value from the specified cgroup file.
// If the value read is "max", math.MaxUint64 is returned.
func GetCgroupParamUint(path, file string) (uint64, error) {
contents, err := GetCgroupParamString(path, file)
if err != nil {
return 0, err
}
if contents == "max" {
return math.MaxUint64, nil
}
res, err := ParseUint(contents, 10, 64)
if err != nil {
return res, &ParseError{Path: path, File: file, Err: err}
}
return res, nil
}
// GetCgroupParamInt reads a single int64 value from the specified cgroup file.
// If the value read is "max", math.MaxInt64 is returned.
func GetCgroupParamInt(path, file string) (int64, error) {
contents, err := GetCgroupParamString(path, file)
if err != nil {
return 0, err
}
if contents == "max" {
return math.MaxInt64, nil
}
res, err := strconv.ParseInt(contents, 10, 64)
if err != nil {
return res, &ParseError{Path: path, File: file, Err: err}
}
return res, nil
}
// GetCgroupParamString reads a string from the specified cgroup file.
func GetCgroupParamString(path, file string) (string, error) {
contents, err := cgroups.ReadFile(path, file)
if err != nil {
return "", err
}
return strings.TrimSpace(contents), nil
}
package cgroups
import (
"io/fs"
"path/filepath"
)
// GetAllPids returns all pids from the cgroup identified by path, and all its
// sub-cgroups.
func GetAllPids(path string) ([]int, error) {
var pids []int
err := filepath.WalkDir(path, func(p string, d fs.DirEntry, iErr error) error {
if iErr != nil {
return iErr
}
if !d.IsDir() {
return nil
}
cPids, err := readProcsFile(p)
if err != nil {
return err
}
pids = append(pids, cPids...)
return nil
})
return pids, err
}
package path
import (
"errors"
"os"
"path/filepath"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
// Inner returns a path to cgroup relative to a cgroup mount point, based
// on cgroup configuration, or an error, if cgroup configuration is invalid.
// To be used only by fs cgroup managers (systemd has different path rules).
func Inner(c *cgroups.Cgroup) (string, error) {
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return "", errors.New("cgroup: either Path or Name and Parent should be used")
}
// XXX: Do not remove cleanPath. Path safety is important! -- cyphar
innerPath := cleanPath(c.Path)
if innerPath == "" {
cgParent := cleanPath(c.Parent)
cgName := cleanPath(c.Name)
innerPath = filepath.Join(cgParent, cgName)
}
return innerPath, nil
}
// cleanPath is a copy of github.com/opencontainers/runc/libcontainer/utils.CleanPath.
func cleanPath(path string) string {
// Deal with empty strings nicely.
if path == "" {
return ""
}
// Ensure that all paths are cleaned (especially problematic ones like
// "/../../../../../" which can cause lots of issues).
if filepath.IsAbs(path) {
return filepath.Clean(path)
}
// If the path isn't absolute, we need to do more processing to fix paths
// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
// paths to relative ones.
path = filepath.Clean(string(os.PathSeparator) + path)
// This can't fail, as (by definition) all paths are relative to root.
path, _ = filepath.Rel(string(os.PathSeparator), path)
return path
}
package manager
import (
"errors"
"fmt"
"path/filepath"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
)
// New returns the instance of a cgroup manager, which is chosen
// based on the local environment (whether cgroup v1 or v2 is used)
// and the config (whether config.Systemd is set or not).
func New(config *cgroups.Cgroup) (cgroups.Manager, error) {
return NewWithPaths(config, nil)
}
// NewWithPaths is similar to New, and can be used in case cgroup paths
// are already well known, which can save some resources.
//
// For cgroup v1, the keys are controller/subsystem name, and the values
// are absolute filesystem paths to the appropriate cgroups.
//
// For cgroup v2, the only key allowed is "" (empty string), and the value
// is the unified cgroup path.
func NewWithPaths(config *cgroups.Cgroup, paths map[string]string) (cgroups.Manager, error) {
if config == nil {
return nil, errors.New("cgroups/manager.New: config must not be nil")
}
if config.Systemd && !systemd.IsRunningSystemd() {
return nil, errors.New("systemd not running on this host, cannot use systemd cgroups manager")
}
// Cgroup v2 aka unified hierarchy.
if cgroups.IsCgroup2UnifiedMode() {
path, err := getUnifiedPath(paths)
if err != nil {
return nil, fmt.Errorf("manager.NewWithPaths: inconsistent paths: %w", err)
}
if config.Systemd {
return systemd.NewUnifiedManager(config, path)
}
return fs2.NewManager(config, path)
}
// Cgroup v1.
if config.Systemd {
return systemd.NewLegacyManager(config, paths)
}
return fs.NewManager(config, paths)
}
// getUnifiedPath is an implementation detail of libcontainer.
// Historically, libcontainer.Create saves cgroup paths as a per-subsystem path
// map (as returned by cm.GetPaths("")), but with v2 we only have a single
// unified path (with "" as a key).
//
// This function converts from that map to string (using "" as a key),
// and also checks that the map itself is sane.
func getUnifiedPath(paths map[string]string) (string, error) {
if len(paths) > 1 {
return "", fmt.Errorf("expected a single path, got %+v", paths)
}
path := paths[""]
// can be empty
if path != "" {
if filepath.Clean(path) != path || !filepath.IsAbs(path) {
return "", fmt.Errorf("invalid path: %q", path)
}
}
return path, nil
}
package cgroups
type ThrottlingData struct {
// Number of periods with throttling active
Periods uint64 `json:"periods,omitempty"`
// Number of periods when the container hit its throttling limit.
ThrottledPeriods uint64 `json:"throttled_periods,omitempty"`
// Aggregate time the container was throttled for in nanoseconds.
ThrottledTime uint64 `json:"throttled_time,omitempty"`
}
// CpuUsage denotes the usage of a CPU.
// All CPU stats are aggregate since container inception.
type CpuUsage struct {
// Total CPU time consumed.
// Units: nanoseconds.
TotalUsage uint64 `json:"total_usage,omitempty"`
// Total CPU time consumed per core.
// Units: nanoseconds.
PercpuUsage []uint64 `json:"percpu_usage,omitempty"`
// CPU time consumed per core in kernel mode
// Units: nanoseconds.
PercpuUsageInKernelmode []uint64 `json:"percpu_usage_in_kernelmode"`
// CPU time consumed per core in user mode
// Units: nanoseconds.
PercpuUsageInUsermode []uint64 `json:"percpu_usage_in_usermode"`
// Time spent by tasks of the cgroup in kernel mode.
// Units: nanoseconds.
UsageInKernelmode uint64 `json:"usage_in_kernelmode"`
// Time spent by tasks of the cgroup in user mode.
// Units: nanoseconds.
UsageInUsermode uint64 `json:"usage_in_usermode"`
}
type PSIData struct {
Avg10 float64 `json:"avg10"`
Avg60 float64 `json:"avg60"`
Avg300 float64 `json:"avg300"`
Total uint64 `json:"total"`
}
type PSIStats struct {
Some PSIData `json:"some,omitempty"`
Full PSIData `json:"full,omitempty"`
}
type CpuStats struct {
CpuUsage CpuUsage `json:"cpu_usage,omitempty"`
ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
PSI *PSIStats `json:"psi,omitempty"`
}
type CPUSetStats struct {
// List of the physical numbers of the CPUs on which processes
// in that cpuset are allowed to execute
CPUs []uint16 `json:"cpus,omitempty"`
// cpu_exclusive flag
CPUExclusive uint64 `json:"cpu_exclusive"`
// List of memory nodes on which processes in that cpuset
// are allowed to allocate memory
Mems []uint16 `json:"mems,omitempty"`
// mem_hardwall flag
MemHardwall uint64 `json:"mem_hardwall"`
// mem_exclusive flag
MemExclusive uint64 `json:"mem_exclusive"`
// memory_migrate flag
MemoryMigrate uint64 `json:"memory_migrate"`
// memory_spread page flag
MemorySpreadPage uint64 `json:"memory_spread_page"`
// memory_spread slab flag
MemorySpreadSlab uint64 `json:"memory_spread_slab"`
// memory_pressure
MemoryPressure uint64 `json:"memory_pressure"`
// sched_load balance flag
SchedLoadBalance uint64 `json:"sched_load_balance"`
// sched_relax_domain_level
SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"`
}
type MemoryData struct {
Usage uint64 `json:"usage,omitempty"`
MaxUsage uint64 `json:"max_usage,omitempty"`
Failcnt uint64 `json:"failcnt"`
Limit uint64 `json:"limit"`
}
type MemoryStats struct {
// memory used for cache
Cache uint64 `json:"cache,omitempty"`
// usage of memory
Usage MemoryData `json:"usage,omitempty"`
// usage of memory + swap
SwapUsage MemoryData `json:"swap_usage,omitempty"`
// usage of swap only
SwapOnlyUsage MemoryData `json:"swap_only_usage,omitempty"`
// usage of kernel memory
KernelUsage MemoryData `json:"kernel_usage,omitempty"`
// usage of kernel TCP memory
KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
// usage of memory pages by NUMA node
// see chapter 5.6 of memory controller documentation
PageUsageByNUMA PageUsageByNUMA `json:"page_usage_by_numa,omitempty"`
// if true, memory usage is accounted for throughout a hierarchy of cgroups.
UseHierarchy bool `json:"use_hierarchy"`
Stats map[string]uint64 `json:"stats,omitempty"`
PSI *PSIStats `json:"psi,omitempty"`
}
type PageUsageByNUMA struct {
// Embedding is used as types can't be recursive.
PageUsageByNUMAInner
Hierarchical PageUsageByNUMAInner `json:"hierarchical,omitempty"`
}
type PageUsageByNUMAInner struct {
Total PageStats `json:"total,omitempty"`
File PageStats `json:"file,omitempty"`
Anon PageStats `json:"anon,omitempty"`
Unevictable PageStats `json:"unevictable,omitempty"`
}
type PageStats struct {
Total uint64 `json:"total,omitempty"`
Nodes map[uint8]uint64 `json:"nodes,omitempty"`
}
type PidsStats struct {
// number of pids in the cgroup
Current uint64 `json:"current,omitempty"`
// active pids hard limit
Limit uint64 `json:"limit,omitempty"`
}
type BlkioStatEntry struct {
Major uint64 `json:"major,omitempty"`
Minor uint64 `json:"minor,omitempty"`
Op string `json:"op,omitempty"`
Value uint64 `json:"value,omitempty"`
}
type BlkioStats struct {
// number of bytes transferred to and from the block device
IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"`
IoServicedRecursive []BlkioStatEntry `json:"io_serviced_recursive,omitempty"`
IoQueuedRecursive []BlkioStatEntry `json:"io_queue_recursive,omitempty"`
IoServiceTimeRecursive []BlkioStatEntry `json:"io_service_time_recursive,omitempty"`
IoWaitTimeRecursive []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"`
IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"`
SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"`
PSI *PSIStats `json:"psi,omitempty"`
}
type HugetlbStats struct {
// current res_counter usage for hugetlb
Usage uint64 `json:"usage,omitempty"`
// maximum usage ever recorded.
MaxUsage uint64 `json:"max_usage,omitempty"`
// number of times the hugetlb allocation failed.
Failcnt uint64 `json:"failcnt"`
}
type RdmaEntry struct {
Device string `json:"device,omitempty"`
HcaHandles uint32 `json:"hca_handles,omitempty"`
HcaObjects uint32 `json:"hca_objects,omitempty"`
}
type RdmaStats struct {
RdmaLimit []RdmaEntry `json:"rdma_limit,omitempty"`
RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"`
}
type MiscStats struct {
// current resource usage for a key in misc
Usage uint64 `json:"usage,omitempty"`
// number of times the resource usage was about to go over the max boundary
Events uint64 `json:"events,omitempty"`
}
type Stats struct {
CpuStats CpuStats `json:"cpu_stats,omitempty"`
CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
MemoryStats MemoryStats `json:"memory_stats,omitempty"`
PidsStats PidsStats `json:"pids_stats,omitempty"`
BlkioStats BlkioStats `json:"blkio_stats,omitempty"`
// the map is in the format "size of hugepage: stats of the hugepage"
HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
RdmaStats RdmaStats `json:"rdma_stats,omitempty"`
// the map is in the format "misc resource name: stats of the key"
MiscStats map[string]MiscStats `json:"misc_stats,omitempty"`
}
func NewStats() *Stats {
memoryStats := MemoryStats{Stats: make(map[string]uint64)}
hugetlbStats := make(map[string]HugetlbStats)
miscStats := make(map[string]MiscStats)
return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats, MiscStats: miscStats}
}
package systemd
import (
"context"
"errors"
"fmt"
"math"
"os"
"strconv"
"strings"
"sync"
"time"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
"github.com/sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
const (
// Default kernel value for cpu quota period is 100000 us (100 ms), same for v1 and v2.
// v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and
// v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
defCPUQuotaPeriod = uint64(100000)
)
var (
versionOnce sync.Once
version int
isRunningSystemdOnce sync.Once
isRunningSystemd bool
// GenerateDeviceProps is a function to generate systemd device
// properties, used by Set methods. Unless
// [github.com/opencontainers/runc/libcontainer/cgroups/devices]
// package is imported, it is set to nil, so cgroup managers can't
// configure devices.
GenerateDeviceProps func(r *cgroups.Resources, sdVer int) ([]systemdDbus.Property, error)
)
// NOTE: This function comes from package github.com/coreos/go-systemd/util
// It was borrowed here to avoid a dependency on cgo.
//
// IsRunningSystemd checks whether the host was booted with systemd as its init
// system. This functions similarly to systemd's `sd_booted(3)`: internally, it
// checks whether /run/systemd/system/ exists and is a directory.
// http://www.freedesktop.org/software/systemd/man/sd_booted.html
func IsRunningSystemd() bool {
isRunningSystemdOnce.Do(func() {
fi, err := os.Lstat("/run/systemd/system")
isRunningSystemd = err == nil && fi.IsDir()
})
return isRunningSystemd
}
// systemd represents the slice hierarchy using `-`, so we need to follow suit
// when generating the path of a slice. Essentially, test-a-b.slice becomes
// /test.slice/test-a.slice/test-a-b.slice.
func ExpandSlice(slice string) (string, error) {
suffix := ".slice"
// Name has to end with ".slice", but can't be just ".slice".
if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
// Path-separators are not allowed.
if strings.Contains(slice, "/") {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
var path, prefix string
sliceName := strings.TrimSuffix(slice, suffix)
// if input was -.slice, we should just return root now
if sliceName == "-" {
return "/", nil
}
for _, component := range strings.Split(sliceName, "-") {
// test--a.slice isn't permitted, nor is -test.slice.
if component == "" {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
// Append the component to the path and to the prefix.
path += "/" + prefix + component + suffix
prefix += component + "-"
}
return path, nil
}
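// Illustrative sketch (not part of the original code):
//
//	p, err := ExpandSlice("machine-qemu.slice")
//	// p == "/machine.slice/machine-qemu.slice", err == nil
//
// and ExpandSlice("-.slice") returns "/" (the root slice).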
func newProp(name string, units interface{}) systemdDbus.Property {
return systemdDbus.Property{
Name: name,
Value: dbus.MakeVariant(units),
}
}
func getUnitName(c *cgroups.Cgroup) string {
// by default, we create a scope unless the user explicitly asks for a slice.
if !strings.HasSuffix(c.Name, ".slice") {
return c.ScopePrefix + "-" + c.Name + ".scope"
}
return c.Name
}
// This code should be in sync with getUnitName.
func getUnitType(unitName string) string {
if strings.HasSuffix(unitName, ".slice") {
return "Slice"
}
return "Scope"
}
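// exampleUnitNaming is an illustrative sketch, not part of the original
// source (the function name is hypothetical); it shows the naming convention
// implemented by getUnitName and getUnitType.
func exampleUnitNaming() {
scope := getUnitName(&cgroups.Cgroup{ScopePrefix: "runc", Name: "mycontainer"})
slice := getUnitName(&cgroups.Cgroup{Name: "machine.slice"})
// scope == "runc-mycontainer.scope" (unit type "Scope"),
// slice == "machine.slice" (unit type "Slice")
fmt.Println(scope, getUnitType(scope), slice, getUnitType(slice))
}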
// isDbusError returns true if the error is a specific dbus error.
func isDbusError(err error, name string) bool {
if err != nil {
var derr dbus.Error
if errors.As(err, &derr) {
return strings.Contains(derr.Name, name)
}
}
return false
}
// isUnitExists returns true if the error is that a systemd unit already exists.
func isUnitExists(err error) bool {
return isDbusError(err, "org.freedesktop.systemd1.UnitExists")
}
func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error {
statusChan := make(chan string, 1)
retry := true
retry:
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
_, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan)
return err
})
if err != nil {
if !isUnitExists(err) {
return err
}
if ignoreExist {
// TODO: remove this hack.
// This is kubelet making sure a slice exists (see
// https://github.com/opencontainers/runc/pull/1124).
return nil
}
if retry {
// In case a unit with the same name exists, this may
// be a leftover failed unit. Reset it, so systemd can
// remove it, and retry once.
err = resetFailedUnit(cm, unitName)
if err != nil {
logrus.Warnf("unable to reset failed unit: %v", err)
}
retry = false
goto retry
}
return err
}
timeout := time.NewTimer(30 * time.Second)
defer timeout.Stop()
select {
case s := <-statusChan:
close(statusChan)
// Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
if s != "done" {
_ = resetFailedUnit(cm, unitName)
return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
}
case <-timeout.C:
_ = resetFailedUnit(cm, unitName)
return errors.New("Timeout waiting for systemd to create " + unitName)
}
return nil
}
func stopUnit(cm *dbusConnManager, unitName string) error {
statusChan := make(chan string, 1)
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
_, err := c.StopUnitContext(context.TODO(), unitName, "replace", statusChan)
return err
})
if err == nil {
timeout := time.NewTimer(30 * time.Second)
defer timeout.Stop()
select {
case s := <-statusChan:
close(statusChan)
// Please refer to https://godoc.org/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
if s != "done" {
logrus.Warnf("error removing unit `%s`: got `%s`. Continuing...", unitName, s)
}
case <-timeout.C:
return errors.New("Timed out while waiting for systemd to remove " + unitName)
}
}
// In case of a failed unit, let systemd remove it.
_ = resetFailedUnit(cm, unitName)
return nil
}
func resetFailedUnit(cm *dbusConnManager, name string) error {
return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
return c.ResetFailedUnitContext(context.TODO(), name)
})
}
func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) {
var prop *systemdDbus.Property
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) (Err error) {
prop, Err = c.GetUnitTypePropertyContext(context.TODO(), unitName, unitType, propertyName)
return Err
})
return prop, err
}
func setUnitProperties(cm *dbusConnManager, name string, properties ...systemdDbus.Property) error {
return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
return c.SetUnitPropertiesContext(context.TODO(), name, true, properties...)
})
}
func getManagerProperty(cm *dbusConnManager, name string) (string, error) {
str := ""
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
var err error
str, err = c.GetManagerProperty(name)
return err
})
if err != nil {
return "", err
}
return strconv.Unquote(str)
}
func systemdVersion(cm *dbusConnManager) int {
versionOnce.Do(func() {
version = -1
verStr, err := getManagerProperty(cm, "Version")
if err == nil {
version, err = systemdVersionAtoi(verStr)
}
if err != nil {
logrus.WithError(err).Error("unable to get systemd version")
}
})
return version
}
// systemdVersionAtoi extracts a numeric systemd version from the argument.
// The argument should be of the form: "v245.4-1.fc32", "245", "v245-1.fc32",
// "245-1.fc32" (with or without quotes). The result for all of the above
// should be 245.
func systemdVersionAtoi(str string) (int, error) {
// Unconditionally remove the leading quote character and/or "v" prefix.
str = strings.TrimLeft(str, `"v`)
// Match on the first integer we can grab.
for i := 0; i < len(str); i++ {
if str[i] < '0' || str[i] > '9' {
// First non-digit: cut the tail.
str = str[:i]
break
}
}
ver, err := strconv.Atoi(str)
if err != nil {
return -1, fmt.Errorf("can't parse version: %w", err)
}
return ver, nil
}
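// exampleSystemdVersionAtoi is an illustrative sketch, not part of the
// original source (the function name is hypothetical); it exercises the
// version strings mentioned in the comment above.
func exampleSystemdVersionAtoi() {
for _, s := range []string{`"v245.4-1.fc32"`, "245", "v245-1.fc32", "245-1.fc32"} {
v, err := systemdVersionAtoi(s)
// every input above parses to 245 with a nil error
fmt.Println(v, err)
}
}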
func addCpuQuota(cm *dbusConnManager, properties *[]systemdDbus.Property, quota int64, period uint64) {
if period != 0 {
// systemd only supports CPUQuotaPeriodUSec since v242
sdVer := systemdVersion(cm)
if sdVer >= 242 {
*properties = append(*properties,
newProp("CPUQuotaPeriodUSec", period))
} else {
logrus.Debugf("systemd v%d is too old to support CPUQuotaPeriodSec "+
" (setting will still be applied to cgroupfs)", sdVer)
}
}
if quota != 0 || period != 0 {
// corresponds to USEC_INFINITY in systemd
cpuQuotaPerSecUSec := uint64(math.MaxUint64)
if quota > 0 {
if period == 0 {
// assume the default
period = defCPUQuotaPeriod
}
// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
// (integer percentage of CPU) internally. This means that if a fractional percent of
// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
cpuQuotaPerSecUSec = uint64(quota*1000000) / period
if cpuQuotaPerSecUSec%10000 != 0 {
cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
}
}
*properties = append(*properties,
newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
}
}
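// Worked example (illustrative, not part of the original source): with
// quota=25000 and period=90000, cpuQuotaPerSecUSec = 25000*1000000/90000 =
// 277777, which is not a multiple of 10000 (1% of a second), so it is rounded
// up to 280000 before being sent to systemd as CPUQuotaPerSecUSec.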
func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems string) error {
if cpus == "" && mems == "" {
return nil
}
// systemd only supports AllowedCPUs/AllowedMemoryNodes since v244
sdVer := systemdVersion(cm)
if sdVer < 244 {
logrus.Debugf("systemd v%d is too old to support AllowedCPUs/AllowedMemoryNodes"+
" (settings will still be applied to cgroupfs)", sdVer)
return nil
}
if cpus != "" {
bits, err := RangeToBits(cpus)
if err != nil {
return fmt.Errorf("resources.CPU.Cpus=%q conversion error: %w",
cpus, err)
}
*props = append(*props,
newProp("AllowedCPUs", bits))
}
if mems != "" {
bits, err := RangeToBits(mems)
if err != nil {
return fmt.Errorf("resources.CPU.Mems=%q conversion error: %w",
mems, err)
}
*props = append(*props,
newProp("AllowedMemoryNodes", bits))
}
return nil
}
// generateDeviceProperties takes the configured device rules and generates a
// corresponding set of systemd properties to configure the devices correctly.
func generateDeviceProperties(r *cgroups.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
if GenerateDeviceProps == nil {
if len(r.Devices) > 0 {
return nil, cgroups.ErrDevicesUnsupported
}
return nil, nil
}
return GenerateDeviceProps(r, systemdVersion(cm))
}
package systemd
import (
"errors"
"math/big"
"strconv"
"strings"
)
// RangeToBits converts a text representation of a CPU mask (as written to
// or read from cgroups' cpuset.* files, e.g. "1,3-5") to a slice of bytes
// with the corresponding bits set (as consumed by systemd over dbus as
// AllowedCPUs/AllowedMemoryNodes unit property value).
func RangeToBits(str string) ([]byte, error) {
bits := new(big.Int)
for _, r := range strings.Split(str, ",") {
// allow extra spaces around
r = strings.TrimSpace(r)
// allow empty elements (extra commas)
if r == "" {
continue
}
startr, endr, ok := strings.Cut(r, "-")
if ok {
start, err := strconv.ParseUint(startr, 10, 32)
if err != nil {
return nil, err
}
end, err := strconv.ParseUint(endr, 10, 32)
if err != nil {
return nil, err
}
if start > end {
return nil, errors.New("invalid range: " + r)
}
for i := start; i <= end; i++ {
bits.SetBit(bits, int(i), 1)
}
} else {
val, err := strconv.ParseUint(startr, 10, 32)
if err != nil {
return nil, err
}
bits.SetBit(bits, int(val), 1)
}
}
ret := bits.Bytes()
if len(ret) == 0 {
// do not allow empty values
return nil, errors.New("empty value")
}
// fit cpuset parsing order in systemd
for l, r := 0, len(ret)-1; l < r; l, r = l+1, r-1 {
ret[l], ret[r] = ret[r], ret[l]
}
return ret, nil
}
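// Illustrative example (not part of the original source):
//
//	bits, _ := RangeToBits("1,3-5")
//	// CPUs 1, 3, 4 and 5 -> bit value 0b111010, so bits == []byte{0x3a}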
package systemd
import (
"context"
"errors"
"fmt"
"sync"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
)
var (
dbusC *systemdDbus.Conn
dbusMu sync.RWMutex
dbusInited bool
dbusRootless bool
)
type dbusConnManager struct{}
// newDbusConnManager initializes systemd dbus connection manager.
func newDbusConnManager(rootless bool) *dbusConnManager {
dbusMu.Lock()
defer dbusMu.Unlock()
if dbusInited && rootless != dbusRootless {
panic("can't have both root and rootless dbus")
}
dbusInited = true
dbusRootless = rootless
return &dbusConnManager{}
}
// getConnection lazily initializes and returns systemd dbus connection.
func (d *dbusConnManager) getConnection() (*systemdDbus.Conn, error) {
// If dbusC is already set, take the read lock first so that
// multiple callers can acquire the existing connection
// concurrently.
dbusMu.RLock()
if conn := dbusC; conn != nil {
dbusMu.RUnlock()
return conn, nil
}
dbusMu.RUnlock()
// dbusC is nil here; take the write lock to ensure
// that only one connection is created.
dbusMu.Lock()
defer dbusMu.Unlock()
if conn := dbusC; conn != nil {
return conn, nil
}
conn, err := d.newConnection()
if err != nil {
// When dbus-user-session is not installed, we can't detect whether we should try to connect to the user dbus or the system dbus, so dbusRootless is set to false.
// This may fail with a cryptic error "read unix @->/run/systemd/private: read: connection reset by peer: unknown."
// https://github.com/moby/moby/issues/42793
return nil, fmt.Errorf("failed to connect to dbus (hint: for rootless containers, maybe you need to install dbus-user-session package, see https://github.com/opencontainers/runc/blob/master/docs/cgroup-v2.md): %w", err)
}
dbusC = conn
return conn, nil
}
func (d *dbusConnManager) newConnection() (*systemdDbus.Conn, error) {
if dbusRootless {
return newUserSystemdDbus()
}
return systemdDbus.NewWithContext(context.TODO())
}
// resetConnection resets the connection to its initial state
// (so it can be reconnected if necessary).
func (d *dbusConnManager) resetConnection(conn *systemdDbus.Conn) {
dbusMu.Lock()
defer dbusMu.Unlock()
if dbusC != nil && dbusC == conn {
dbusC.Close()
dbusC = nil
}
}
// retryOnDisconnect calls op, and if the error it returns is about closed dbus
// connection, the connection is re-established and the op is retried. This helps
// with the situation when dbus is restarted and we have a stale connection.
func (d *dbusConnManager) retryOnDisconnect(op func(*systemdDbus.Conn) error) error {
for {
conn, err := d.getConnection()
if err != nil {
return err
}
err = op(conn)
if err == nil {
return nil
}
if !errors.Is(err, dbus.ErrClosed) {
return err
}
d.resetConnection(conn)
}
}
package systemd
import (
"reflect"
dbus "github.com/godbus/dbus/v5"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
// freezeBeforeSet answers whether there is a need to freeze the cgroup before
// applying its systemd unit properties, and thaw after, while avoiding
// unnecessary freezer state changes.
//
// The reason why we have to freeze is that systemd's application of device
// rules is done disruptively, resulting in spurious errors to common devices
// (unlike our fs driver, they will happily write deny-all rules to running
// containers). So we have to freeze the container to avoid it getting an
// occasional "permission denied" error.
func (m *LegacyManager) freezeBeforeSet(unitName string, r *cgroups.Resources) (needsFreeze, needsThaw bool, err error) {
// Special case for SkipDevices, as used by Kubernetes to create pod
// cgroups with an allow-all device policy.
if r.SkipDevices {
if r.SkipFreezeOnSet {
// Both needsFreeze and needsThaw are false.
return
}
// No need to freeze if SkipDevices is set, and either
// (1) systemd unit does not (yet) exist, or
// (2) it has DevicePolicy=auto and empty DeviceAllow list.
//
// Interestingly, (1) and (2) are the same here because
// a non-existent unit returns default properties,
// and settings in (2) are the defaults.
//
// Do not return errors from getUnitTypeProperty, as they alone
// should not prevent Set from working.
unitType := getUnitType(unitName)
devPolicy, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DevicePolicy")
if e == nil && devPolicy.Value == dbus.MakeVariant("auto") {
devAllow, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DeviceAllow")
if e == nil {
if rv := reflect.ValueOf(devAllow.Value.Value()); rv.Kind() == reflect.Slice && rv.Len() == 0 {
needsFreeze = false
needsThaw = false
return
}
}
}
}
needsFreeze = true
needsThaw = true
// Check the current freezer state.
freezerState, err := m.GetFreezerState()
if err != nil {
return
}
if freezerState == cgroups.Frozen {
// Already frozen, and should stay frozen.
needsFreeze = false
needsThaw = false
}
if r.Freezer == cgroups.Frozen {
// Will be frozen anyway -- no need to thaw.
needsThaw = false
}
return
}
package systemd
import (
"bufio"
"bytes"
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
"github.com/moby/sys/userns"
)
// newUserSystemdDbus creates a connection for systemd user-instance.
func newUserSystemdDbus() (*systemdDbus.Conn, error) {
addr, err := DetectUserDbusSessionBusAddress()
if err != nil {
return nil, err
}
uid, err := DetectUID()
if err != nil {
return nil, err
}
return systemdDbus.NewConnection(func() (*dbus.Conn, error) {
conn, err := dbus.Dial(addr)
if err != nil {
return nil, fmt.Errorf("error while dialing %q: %w", addr, err)
}
methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(uid))}
err = conn.Auth(methods)
if err != nil {
conn.Close()
return nil, fmt.Errorf("error while authenticating connection (address=%q, UID=%d): %w", addr, uid, err)
}
if err = conn.Hello(); err != nil {
conn.Close()
return nil, fmt.Errorf("error while sending Hello message (address=%q, UID=%d): %w", addr, uid, err)
}
return conn, nil
})
}
// DetectUID detects UID from the OwnerUID field of `busctl --user status`
// if running in userNS. The value corresponds to sd_bus_creds_get_owner_uid(3) .
//
// Otherwise returns os.Getuid() .
func DetectUID() (int, error) {
if !userns.RunningInUserNS() {
return os.Getuid(), nil
}
b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput()
if err != nil {
return -1, fmt.Errorf("could not execute `busctl --user --no-pager status` (output: %q): %w", string(b), err)
}
scanner := bufio.NewScanner(bytes.NewReader(b))
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if uidStr, ok := strings.CutPrefix(s, "OwnerUID="); ok {
i, err := strconv.Atoi(uidStr)
if err != nil {
return -1, fmt.Errorf("could not detect the OwnerUID: %w", err)
}
return i, nil
}
}
if err := scanner.Err(); err != nil {
return -1, err
}
return -1, errors.New("could not detect the OwnerUID")
}
// DetectUserDbusSessionBusAddress returns $DBUS_SESSION_BUS_ADDRESS, if set.
// Otherwise it returns "unix:path=$XDG_RUNTIME_DIR/bus", if $XDG_RUNTIME_DIR/bus exists.
func DetectUserDbusSessionBusAddress() (string, error) {
if env := os.Getenv("DBUS_SESSION_BUS_ADDRESS"); env != "" {
return env, nil
}
if xdr := os.Getenv("XDG_RUNTIME_DIR"); xdr != "" {
busPath := filepath.Join(xdr, "bus")
if _, err := os.Stat(busPath); err == nil {
busAddress := "unix:path=" + dbus.EscapeBusAddressValue(busPath)
return busAddress, nil
}
}
return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from the environment; make sure you have installed the dbus-user-session or dbus-daemon package; note you may need to re-login")
}
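// Illustrative example (not part of the original source): with
// DBUS_SESSION_BUS_ADDRESS unset, XDG_RUNTIME_DIR=/run/user/1000, and
// /run/user/1000/bus present, the function returns
// "unix:path=/run/user/1000/bus".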
package systemd
import (
"errors"
"os"
"path/filepath"
"strings"
"sync"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
)
type LegacyManager struct {
mu sync.Mutex
cgroups *cgroups.Cgroup
paths map[string]string
dbus *dbusConnManager
}
func NewLegacyManager(cg *cgroups.Cgroup, paths map[string]string) (*LegacyManager, error) {
if cg.Rootless {
return nil, errors.New("cannot use rootless systemd cgroups manager on cgroup v1")
}
if cg.Resources != nil && cg.Resources.Unified != nil {
return nil, cgroups.ErrV1NoUnified
}
if paths == nil {
var err error
paths, err = initPaths(cg)
if err != nil {
return nil, err
}
}
return &LegacyManager{
cgroups: cg,
paths: paths,
dbus: newDbusConnManager(false),
}, nil
}
type subsystem interface {
// Name returns the name of the subsystem.
Name() string
// GetStats returns the stats, as 'stats', corresponding to the cgroup under 'path'.
GetStats(path string, stats *cgroups.Stats) error
// Set sets cgroup resource limits.
Set(path string, r *cgroups.Resources) error
}
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
var legacySubsystems = []subsystem{
&fs.CpusetGroup{},
&fs.DevicesGroup{},
&fs.MemoryGroup{},
&fs.CpuGroup{},
&fs.CpuacctGroup{},
&fs.PidsGroup{},
&fs.BlkioGroup{},
&fs.HugetlbGroup{},
&fs.PerfEventGroup{},
&fs.FreezerGroup{},
&fs.NetPrioGroup{},
&fs.NetClsGroup{},
&fs.NameGroup{GroupName: "name=systemd"},
&fs.RdmaGroup{},
&fs.NameGroup{GroupName: "misc"},
}
func genV1ResourcesProperties(r *cgroups.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
var properties []systemdDbus.Property
deviceProperties, err := generateDeviceProperties(r, cm)
if err != nil {
return nil, err
}
properties = append(properties, deviceProperties...)
if r.Memory != 0 {
properties = append(properties,
newProp("MemoryLimit", uint64(r.Memory)))
}
if r.CpuShares != 0 {
properties = append(properties,
newProp("CPUShares", r.CpuShares))
}
addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod)
if r.BlkioWeight != 0 {
properties = append(properties,
newProp("BlockIOWeight", uint64(r.BlkioWeight)))
}
if r.PidsLimit > 0 || r.PidsLimit == -1 {
properties = append(properties,
newProp("TasksMax", uint64(r.PidsLimit)))
}
err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems)
if err != nil {
return nil, err
}
return properties, nil
}
// initPaths figures out and returns paths to cgroups.
func initPaths(c *cgroups.Cgroup) (map[string]string, error) {
slice := "system.slice"
if c.Parent != "" {
var err error
slice, err = ExpandSlice(c.Parent)
if err != nil {
return nil, err
}
}
unit := getUnitName(c)
paths := make(map[string]string)
for _, s := range legacySubsystems {
subsystemPath, err := getSubsystemPath(slice, unit, s.Name())
if err != nil {
// Even if it's a `not found` error, we return it,
// because the devices cgroup is a hard requirement
// for container security.
if s.Name() == "devices" {
return nil, err
}
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
}
return nil, err
}
paths[s.Name()] = subsystemPath
}
// If systemd is using cgroups-hybrid mode then add the slice path of
// this container to the paths so the following process executed with
// "runc exec" joins that cgroup as well.
if cgroups.IsCgroup2HybridMode() {
// "" means cgroup-hybrid path
cgroupsHybridPath, err := getSubsystemPath(slice, unit, "")
if err != nil && cgroups.IsNotFound(err) {
return nil, err
}
paths[""] = cgroupsHybridPath
}
return paths, nil
}
func (m *LegacyManager) Apply(pid int) error {
var (
c = m.cgroups
unitName = getUnitName(c)
slice = "system.slice"
properties []systemdDbus.Property
)
m.mu.Lock()
defer m.mu.Unlock()
if c.Parent != "" {
slice = c.Parent
}
properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
if strings.HasSuffix(unitName, ".slice") {
// If we create a slice, the parent is defined via a Wants=.
properties = append(properties, systemdDbus.PropWants(slice))
} else {
// Otherwise it's a scope, which we put into a Slice=.
properties = append(properties, systemdDbus.PropSlice(slice))
// Assume scopes always support delegation (supported since systemd v218).
properties = append(properties, newProp("Delegate", true))
}
// Only add the pid if it's valid; -1 is used with generic slice creation.
if pid != -1 {
properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
}
// Always enable accounting; this gets us the same behaviour as the fs implementation,
// plus the kernel has some problems with joining the memory cgroup at a later time.
properties = append(properties,
newProp("MemoryAccounting", true),
newProp("CPUAccounting", true),
newProp("BlockIOAccounting", true),
newProp("TasksAccounting", true),
)
// Assume DefaultDependencies= will always work (the check for it was previously broken.)
properties = append(properties,
newProp("DefaultDependencies", false))
properties = append(properties, c.SystemdProps...)
if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil {
return err
}
if err := m.joinCgroups(pid); err != nil {
return err
}
return nil
}
func (m *LegacyManager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
stopErr := stopUnit(m.dbus, getUnitName(m.cgroups))
// Both on success and on error, cleanup all the cgroups
// we are aware of, as some of them were created directly
// by Apply() and are not managed by systemd.
if err := cgroups.RemovePaths(m.paths); err != nil && stopErr == nil {
return err
}
return stopErr
}
func (m *LegacyManager) Path(subsys string) string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths[subsys]
}
func (m *LegacyManager) joinCgroups(pid int) error {
for _, sys := range legacySubsystems {
name := sys.Name()
switch name {
case "name=systemd":
// let systemd handle this
case "cpuset":
if path, ok := m.paths[name]; ok {
s := &fs.CpusetGroup{}
if err := s.ApplyDir(path, m.cgroups.Resources, pid); err != nil {
return err
}
}
default:
if path, ok := m.paths[name]; ok {
if err := os.MkdirAll(path, 0o755); err != nil {
return err
}
if err := cgroups.WriteCgroupProc(path, pid); err != nil {
return err
}
}
}
}
return nil
}
func getSubsystemPath(slice, unit, subsystem string) (string, error) {
mountpoint, err := cgroups.FindCgroupMountpoint("", subsystem)
if err != nil {
return "", err
}
return filepath.Join(mountpoint, slice, unit), nil
}
func (m *LegacyManager) Freeze(state cgroups.FreezerState) error {
err := m.doFreeze(state)
if err == nil {
m.cgroups.Resources.Freezer = state
}
return err
}
// doFreeze is the same as Freeze but without
// changing the m.cgroups.Resources.Freezer field.
func (m *LegacyManager) doFreeze(state cgroups.FreezerState) error {
path, ok := m.paths["freezer"]
if !ok {
return errSubsystemDoesNotExist
}
freezer := &fs.FreezerGroup{}
resources := &cgroups.Resources{Freezer: state}
return freezer.Set(path, resources)
}
func (m *LegacyManager) GetPids() ([]int, error) {
path, ok := m.paths["devices"]
if !ok {
return nil, errSubsystemDoesNotExist
}
return cgroups.GetPids(path)
}
func (m *LegacyManager) GetAllPids() ([]int, error) {
path, ok := m.paths["devices"]
if !ok {
return nil, errSubsystemDoesNotExist
}
return cgroups.GetAllPids(path)
}
func (m *LegacyManager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
stats := cgroups.NewStats()
for _, sys := range legacySubsystems {
path := m.paths[sys.Name()]
if path == "" {
continue
}
if err := sys.GetStats(path, stats); err != nil {
return nil, err
}
}
return stats, nil
}
func (m *LegacyManager) Set(r *cgroups.Resources) error {
if r == nil {
return nil
}
if r.Unified != nil {
return cgroups.ErrV1NoUnified
}
properties, err := genV1ResourcesProperties(r, m.dbus)
if err != nil {
return err
}
unitName := getUnitName(m.cgroups)
needsFreeze, needsThaw, err := m.freezeBeforeSet(unitName, r)
if err != nil {
return err
}
if needsFreeze {
if err := m.doFreeze(cgroups.Frozen); err != nil {
// If freezer cgroup isn't supported, we just warn about it.
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
// Do not proceed with the update if freezing failed for a reason other than a missing freezer cgroup (see issue #3803).
if !errors.Is(err, errSubsystemDoesNotExist) {
if needsThaw {
if thawErr := m.doFreeze(cgroups.Thawed); thawErr != nil {
logrus.Infof("thaw container after doFreeze failed: %v", thawErr)
}
}
return err
}
}
}
setErr := setUnitProperties(m.dbus, unitName, properties...)
if needsThaw {
if err := m.doFreeze(cgroups.Thawed); err != nil {
logrus.Infof("thaw container after SetUnitProperties failed: %v", err)
}
}
if setErr != nil {
return setErr
}
for _, sys := range legacySubsystems {
// Get the subsystem path, but don't error out for not found cgroups.
path, ok := m.paths[sys.Name()]
if !ok {
continue
}
if err := sys.Set(path, r); err != nil {
return err
}
}
return nil
}
func (m *LegacyManager) GetPaths() map[string]string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths
}
func (m *LegacyManager) GetCgroups() (*cgroups.Cgroup, error) {
return m.cgroups, nil
}
func (m *LegacyManager) GetFreezerState() (cgroups.FreezerState, error) {
path, ok := m.paths["freezer"]
if !ok {
return cgroups.Undefined, nil
}
freezer := &fs.FreezerGroup{}
return freezer.GetState(path)
}
func (m *LegacyManager) Exists() bool {
return cgroups.PathExists(m.Path("devices"))
}
func (m *LegacyManager) OOMKillCount() (uint64, error) {
return fs.OOMKillCount(m.Path("memory"))
}
package systemd
import (
"bufio"
"errors"
"fmt"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
)
const (
cpuIdleSupportedVersion = 252
)
type UnifiedManager struct {
mu sync.Mutex
cgroups *cgroups.Cgroup
// path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
path string
dbus *dbusConnManager
fsMgr cgroups.Manager
}
func NewUnifiedManager(config *cgroups.Cgroup, path string) (*UnifiedManager, error) {
m := &UnifiedManager{
cgroups: config,
path: path,
dbus: newDbusConnManager(config.Rootless),
}
if err := m.initPath(); err != nil {
return nil, err
}
fsMgr, err := fs2.NewManager(config, m.path)
if err != nil {
return nil, err
}
m.fsMgr = fsMgr
return m, nil
}
func shouldSetCPUIdle(cm *dbusConnManager, v string) bool {
// The only valid values for cpu.idle are 0 and 1. As it is
// not possible to directly set cpu.idle to 0 via systemd,
// ignore 0. Ignore other values as we'll error out later
// in Set() while calling fsMgr.Set().
return v == "1" && systemdVersion(cm) >= cpuIdleSupportedVersion
}
// unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified
// key/value map (where key is cgroupfs file name) to systemd unit properties.
// This is on a best-effort basis, so the properties that are not known
// (to this function and/or systemd) are ignored (but logged with "debug"
// log level).
//
// For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt
//
// For the list of systemd unit properties, see systemd.resource-control(5).
func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props []systemdDbus.Property, _ error) {
var err error
for k, v := range res {
if strings.Contains(k, "/") {
return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
}
if strings.IndexByte(k, '.') <= 0 {
return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k)
}
// Kernel is quite forgiving to extra whitespace
// around the value, and so should we.
v = strings.TrimSpace(v)
// Please keep cases in alphabetical order.
switch k {
case "cpu.idle":
if shouldSetCPUIdle(cm, v) {
// Setting CPUWeight to 0 tells systemd
// to set cpu.idle to 1.
props = append(props,
newProp("CPUWeight", uint64(0)))
}
case "cpu.max":
// value: quota [period]
quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set
period := defCPUQuotaPeriod
sv := strings.Fields(v)
if len(sv) < 1 || len(sv) > 2 {
return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v)
}
// quota
if sv[0] != "max" {
quota, err = strconv.ParseInt(sv[0], 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err)
}
}
// period
if len(sv) == 2 {
period, err = strconv.ParseUint(sv[1], 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err)
}
}
addCpuQuota(cm, &props, quota, period)
case "cpu.weight":
if shouldSetCPUIdle(cm, strings.TrimSpace(res["cpu.idle"])) {
// Do not add duplicate CPUWeight property
// (see case "cpu.idle" above).
logrus.Warn("unable to apply both cpu.weight and cpu.idle to systemd, ignoring cpu.weight")
continue
}
num, err := strconv.ParseUint(v, 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
}
props = append(props,
newProp("CPUWeight", num))
case "cpuset.cpus", "cpuset.mems":
bits, err := RangeToBits(v)
if err != nil {
return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err)
}
m := map[string]string{
"cpuset.cpus": "AllowedCPUs",
"cpuset.mems": "AllowedMemoryNodes",
}
// systemd only supports these properties since v244
sdVer := systemdVersion(cm)
if sdVer >= 244 {
props = append(props,
newProp(m[k], bits))
} else {
logrus.Debugf("systemd v%d is too old to support %s"+
" (setting will still be applied to cgroupfs)",
sdVer, m[k])
}
case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max":
num := uint64(math.MaxUint64)
if v != "max" {
num, err = strconv.ParseUint(v, 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
}
}
m := map[string]string{
"memory.high": "MemoryHigh",
"memory.low": "MemoryLow",
"memory.min": "MemoryMin",
"memory.max": "MemoryMax",
"memory.swap.max": "MemorySwapMax",
}
props = append(props,
newProp(m[k], num))
case "pids.max":
num := uint64(math.MaxUint64)
if v != "max" {
var err error
num, err = strconv.ParseUint(v, 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
}
}
props = append(props,
newProp("TasksMax", num))
case "memory.oom.group":
// Setting this to 1 is roughly equivalent to OOMPolicy=kill
// (as per systemd.service(5) and
// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html),
// but it's not clear what to do if it is unset or set
// to 0 in runc update, as there are two other possible
// values for OOMPolicy (continue/stop).
fallthrough
default:
// Ignore the unknown resource here -- will still be
// applied in Set which calls fs2.Set.
logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v)
}
}
return props, nil
}
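// Illustrative mapping (not part of the original source): given a Unified map
// such as {"cpu.max": "500000 100000", "pids.max": "max"}, the function above
// produces (on a new enough systemd) CPUQuotaPeriodUSec=100000,
// CPUQuotaPerSecUSec=5000000 and TasksMax=math.MaxUint64; keys it cannot
// translate are only logged here and are later applied directly to cgroupfs
// via fsMgr.Set.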
func genV2ResourcesProperties(dirPath string, r *cgroups.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
// We need this check before setting systemd properties, otherwise
// the container is OOM-killed and the systemd unit is removed
// before we get to fsMgr.Set().
if err := fs2.CheckMemoryUsage(dirPath, r); err != nil {
return nil, err
}
var properties []systemdDbus.Property
// NOTE: This is of questionable correctness because we insert our own
// devices eBPF program later. Two programs with identical rules
// aren't the end of the world, but it is a bit concerning. However
// it's unclear if systemd removes all eBPF programs attached when
// doing SetUnitProperties...
deviceProperties, err := generateDeviceProperties(r, cm)
if err != nil {
return nil, err
}
properties = append(properties, deviceProperties...)
if r.Memory != 0 {
properties = append(properties,
newProp("MemoryMax", uint64(r.Memory)))
}
if r.MemoryReservation != 0 {
properties = append(properties,
newProp("MemoryLow", uint64(r.MemoryReservation)))
}
swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
if err != nil {
return nil, err
}
if swap != 0 {
properties = append(properties,
newProp("MemorySwapMax", uint64(swap)))
}
idleSet := false
// The logic here is the same as in shouldSetCPUIdle.
if r.CPUIdle != nil && *r.CPUIdle == 1 && systemdVersion(cm) >= cpuIdleSupportedVersion {
properties = append(properties,
newProp("CPUWeight", uint64(0)))
idleSet = true
}
if r.CpuWeight != 0 {
if idleSet {
// Ignore CpuWeight if CPUIdle is already set.
logrus.Warn("unable to apply both CPUWeight and CpuIdle to systemd, ignoring CPUWeight")
} else {
properties = append(properties,
newProp("CPUWeight", r.CpuWeight))
}
}
addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod)
if r.PidsLimit > 0 || r.PidsLimit == -1 {
properties = append(properties,
newProp("TasksMax", uint64(r.PidsLimit)))
}
err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems)
if err != nil {
return nil, err
}
// ignore r.KernelMemory
// convert Resources.Unified map to systemd properties
if r.Unified != nil {
unifiedProps, err := unifiedResToSystemdProps(cm, r.Unified)
if err != nil {
return nil, err
}
properties = append(properties, unifiedProps...)
}
return properties, nil
}
func (m *UnifiedManager) Apply(pid int) error {
var (
c = m.cgroups
unitName = getUnitName(c)
properties []systemdDbus.Property
)
slice := "system.slice"
if m.cgroups.Rootless {
slice = "user.slice"
}
if c.Parent != "" {
slice = c.Parent
}
properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
if strings.HasSuffix(unitName, ".slice") {
// If we create a slice, the parent is defined via a Wants=.
properties = append(properties, systemdDbus.PropWants(slice))
} else {
// Otherwise it's a scope, which we put into a Slice=.
properties = append(properties, systemdDbus.PropSlice(slice))
// Assume scopes always support delegation (supported since systemd v218).
properties = append(properties, newProp("Delegate", true))
}
// Only add the pid if it's valid; -1 is used with generic slice creation.
if pid != -1 {
properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
}
// Always enable accounting; this gets us the same behaviour as the fs implementation,
// plus the kernel has some problems with joining the memory cgroup at a later time.
properties = append(properties,
newProp("MemoryAccounting", true),
newProp("CPUAccounting", true),
newProp("IOAccounting", true),
newProp("TasksAccounting", true),
)
// Assume DefaultDependencies= will always work (the check for it was previously broken.)
properties = append(properties,
newProp("DefaultDependencies", false))
properties = append(properties, c.SystemdProps...)
if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil {
return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err)
}
if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
return err
}
if c.OwnerUID != nil {
// The directory itself must be chowned.
err := os.Chown(m.path, *c.OwnerUID, -1)
if err != nil {
return err
}
filesToChown, err := cgroupFilesToChown()
if err != nil {
return err
}
for _, v := range filesToChown {
err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1)
// Some files might not be present.
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
}
}
return nil
}
// The kernel exposes a list of files that should be chowned to the delegate
// uid in /sys/kernel/cgroup/delegate. If the file is not present
// (Linux < 4.15), use the initial values mentioned in cgroups(7).
func cgroupFilesToChown() ([]string, error) {
const cgroupDelegateFile = "/sys/kernel/cgroup/delegate"
f, err := os.Open(cgroupDelegateFile)
if err != nil {
return []string{"cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil
}
defer f.Close()
filesToChown := []string{}
scanner := bufio.NewScanner(f)
for scanner.Scan() {
filesToChown = append(filesToChown, scanner.Text())
}
if err := scanner.Err(); err != nil {
return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err)
}
return filesToChown, nil
}
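// Illustrative note (not part of the original source): on kernels that do
// provide /sys/kernel/cgroup/delegate, the returned list typically includes
// the same entries as the fallback above (cgroup.procs, cgroup.subtree_control,
// cgroup.threads); newer kernels may list additional delegatable files, which
// are then chowned as well.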
func (m *UnifiedManager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
unitName := getUnitName(m.cgroups)
if err := stopUnit(m.dbus, unitName); err != nil {
return err
}
// systemd 239 does not remove sub-cgroups.
err := m.fsMgr.Destroy()
// fsMgr.Destroy has handled ErrNotExist
if err != nil {
return err
}
return nil
}
func (m *UnifiedManager) Path(_ string) string {
return m.path
}
// getSliceFull value is used in initPath.
// The value is incompatible with systemdDbus.PropSlice.
func (m *UnifiedManager) getSliceFull() (string, error) {
c := m.cgroups
slice := "system.slice"
if c.Rootless {
slice = "user.slice"
}
if c.Parent != "" {
var err error
slice, err = ExpandSlice(c.Parent)
if err != nil {
return "", err
}
}
if c.Rootless {
// managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service".
managerCG, err := getManagerProperty(m.dbus, "ControlGroup")
if err != nil {
return "", err
}
slice = filepath.Join(managerCG, slice)
}
// an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice"
// NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified.
return slice, nil
}
func (m *UnifiedManager) initPath() error {
if m.path != "" {
return nil
}
sliceFull, err := m.getSliceFull()
if err != nil {
return err
}
c := m.cgroups
path := filepath.Join(sliceFull, getUnitName(c))
path, err = securejoin.SecureJoin(fs2.UnifiedMountpoint, path)
if err != nil {
return err
}
// an example of the final path in rootless:
// "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope"
m.path = path
return nil
}
func (m *UnifiedManager) Freeze(state cgroups.FreezerState) error {
return m.fsMgr.Freeze(state)
}
func (m *UnifiedManager) GetPids() ([]int, error) {
return cgroups.GetPids(m.path)
}
func (m *UnifiedManager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.path)
}
func (m *UnifiedManager) GetStats() (*cgroups.Stats, error) {
return m.fsMgr.GetStats()
}
func (m *UnifiedManager) Set(r *cgroups.Resources) error {
if r == nil {
return nil
}
properties, err := genV2ResourcesProperties(m.fsMgr.Path(""), r, m.dbus)
if err != nil {
return err
}
if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil {
return fmt.Errorf("unable to set unit properties: %w", err)
}
return m.fsMgr.Set(r)
}
func (m *UnifiedManager) GetPaths() map[string]string {
paths := make(map[string]string, 1)
paths[""] = m.path
return paths
}
func (m *UnifiedManager) GetCgroups() (*cgroups.Cgroup, error) {
return m.cgroups, nil
}
func (m *UnifiedManager) GetFreezerState() (cgroups.FreezerState, error) {
return m.fsMgr.GetFreezerState()
}
func (m *UnifiedManager) Exists() bool {
return cgroups.PathExists(m.path)
}
func (m *UnifiedManager) OOMKillCount() (uint64, error) {
return m.fsMgr.OOMKillCount()
}
package cgroups
import (
"bufio"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
"github.com/moby/sys/userns"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
const (
CgroupProcesses = "cgroup.procs"
unifiedMountpoint = "/sys/fs/cgroup"
hybridMountpoint = "/sys/fs/cgroup/unified"
)
var (
isUnifiedOnce sync.Once
isUnified bool
isHybridOnce sync.Once
isHybrid bool
)
// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
func IsCgroup2UnifiedMode() bool {
isUnifiedOnce.Do(func() {
var st unix.Statfs_t
err := unix.Statfs(unifiedMountpoint, &st)
if err != nil {
level := logrus.WarnLevel
if os.IsNotExist(err) && userns.RunningInUserNS() {
// For rootless containers, sweep it under the rug.
level = logrus.DebugLevel
}
logrus.StandardLogger().Logf(level,
"statfs %s: %v; assuming cgroup v1", unifiedMountpoint, err)
}
isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
})
return isUnified
}
// IsCgroup2HybridMode returns whether we are running in cgroup v2 hybrid mode.
func IsCgroup2HybridMode() bool {
isHybridOnce.Do(func() {
var st unix.Statfs_t
err := unix.Statfs(hybridMountpoint, &st)
if err != nil {
isHybrid = false
if !os.IsNotExist(err) {
// Report unexpected errors.
logrus.WithError(err).Debugf("statfs(%q) failed", hybridMountpoint)
}
return
}
isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC
})
return isHybrid
}
type Mount struct {
Mountpoint string
Root string
Subsystems []string
}
// GetCgroupMounts returns the mounts for the cgroup subsystems.
// all indicates whether to return just the first instance or all the mounts.
// This function should not be used from cgroupv2 code, as in this case
// all the controllers are available under the constant unifiedMountpoint.
func GetCgroupMounts(all bool) ([]Mount, error) {
if IsCgroup2UnifiedMode() {
// TODO: remove cgroupv2 case once all external users are converted
availableControllers, err := GetAllSubsystems()
if err != nil {
return nil, err
}
m := Mount{
Mountpoint: unifiedMountpoint,
Root: unifiedMountpoint,
Subsystems: availableControllers,
}
return []Mount{m}, nil
}
return getCgroupMountsV1(all)
}
// GetAllSubsystems returns all the cgroup subsystems supported by the kernel
func GetAllSubsystems() ([]string, error) {
// /proc/cgroups is meaningless for v2
// https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features
if IsCgroup2UnifiedMode() {
// "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers.
// - devices: implemented in kernel 4.15
// - freezer: implemented in kernel 5.2
// We assume these are always available, as it is hard to detect availability.
pseudo := []string{"devices", "freezer"}
data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers")
if err != nil {
return nil, err
}
subsystems := append(pseudo, strings.Fields(data)...)
return subsystems, nil
}
f, err := os.Open("/proc/cgroups")
if err != nil {
return nil, err
}
defer f.Close()
subsystems := []string{}
s := bufio.NewScanner(f)
for s.Scan() {
text := s.Text()
if text[0] != '#' {
parts := strings.Fields(text)
if len(parts) >= 4 && parts[3] != "0" {
subsystems = append(subsystems, parts[0])
}
}
}
if err := s.Err(); err != nil {
return nil, err
}
return subsystems, nil
}
func readProcsFile(dir string) (out []int, _ error) {
file := CgroupProcesses
retry := true
again:
f, err := OpenFile(dir, file, os.O_RDONLY)
if err != nil {
return nil, err
}
defer f.Close()
s := bufio.NewScanner(f)
for s.Scan() {
if t := s.Text(); t != "" {
pid, err := strconv.Atoi(t)
if err != nil {
return nil, err
}
out = append(out, pid)
}
}
if errors.Is(s.Err(), unix.ENOTSUP) && retry {
// For a threaded cgroup, read returns ENOTSUP, and we should
// read from cgroup.threads instead.
file = "cgroup.threads"
retry = false
goto again
}
return out, s.Err()
}
// ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup
// or /proc/<pid>/cgroup, into a map of subsystems to cgroup paths, e.g.
//
// "cpu": "/user.slice/user-1000.slice"
// "pids": "/user.slice/user-1000.slice"
//
// etc.
//
// Note that for cgroup v2 unified hierarchy, there are no per-controller
// cgroup paths, so the resulting map will have a single element where the key
// is empty string ("") and the value is the cgroup path the <pid> is in.
func ParseCgroupFile(path string) (map[string]string, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
defer f.Close()
return parseCgroupFromReader(f)
}
// helper function for ParseCgroupFile to make testing easier
func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
s := bufio.NewScanner(r)
cgroups := make(map[string]string)
for s.Scan() {
text := s.Text()
// from cgroups(7):
// /proc/[pid]/cgroup
// ...
// For each cgroup hierarchy ... there is one entry
// containing three colon-separated fields of the form:
// hierarchy-ID:subsystem-list:cgroup-path
parts := strings.SplitN(text, ":", 3)
if len(parts) < 3 {
return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text)
}
for _, subs := range strings.Split(parts[1], ",") {
cgroups[subs] = parts[2]
}
}
if err := s.Err(); err != nil {
return nil, err
}
return cgroups, nil
}
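// exampleParseCgroup is an illustrative sketch, not part of the original
// source (the function name and sample input are hypothetical); it shows the
// map produced by parseCgroupFromReader.
func exampleParseCgroup() {
input := "7:cpu,cpuacct:/user.slice\n0::/user.slice/session-1.scope\n"
m, err := parseCgroupFromReader(strings.NewReader(input))
// m["cpu"] == "/user.slice", m["cpuacct"] == "/user.slice",
// m[""] == "/user.slice/session-1.scope" (the cgroup v2 entry), err == nil
fmt.Println(m, err)
}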
func PathExists(path string) bool {
if _, err := os.Stat(path); err != nil {
return false
}
return true
}
// rmdir tries to remove a directory, optionally retrying on EBUSY.
func rmdir(path string, retry bool) error {
delay := time.Millisecond
tries := 10
again:
err := unix.Rmdir(path)
switch err { // nolint:errorlint // unix errors are bare
case nil, unix.ENOENT:
return nil
case unix.EINTR:
goto again
case unix.EBUSY:
if retry && tries > 0 {
time.Sleep(delay)
delay *= 2
tries--
goto again
}
}
return &os.PathError{Op: "rmdir", Path: path, Err: err}
}
// RemovePath aims to remove cgroup path. It does so recursively,
// by removing any subdirectories (sub-cgroups) first.
func RemovePath(path string) error {
// Try the fast path first; don't retry on EBUSY yet.
if err := rmdir(path, false); err == nil {
return nil
}
// There are many reasons why rmdir can fail, including:
// 1. cgroup have existing sub-cgroups;
// 2. cgroup (still) have some processes (that are about to vanish);
// 3. lack of permission (one example is read-only /sys/fs/cgroup mount,
// in which case rmdir returns EROFS even for a non-existent path,
// see issue 4518).
//
// Using os.ReadDir here kills two birds with one stone: check if
// the directory exists (handling scenario 3 above), and use
// directory contents to remove sub-cgroups (handling scenario 1).
infos, err := os.ReadDir(path)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
// Let's remove sub-cgroups, if any.
for _, info := range infos {
if info.IsDir() {
if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
return err
}
}
}
// Finally, try rmdir again, this time with retries on EBUSY,
// which may help with scenario 2 above.
return rmdir(path, true)
}
// RemovePaths iterates over the provided paths removing them.
func RemovePaths(paths map[string]string) (err error) {
for s, p := range paths {
if err := RemovePath(p); err == nil {
delete(paths, s)
}
}
if len(paths) == 0 {
clear(paths)
return nil
}
return fmt.Errorf("Failed to remove paths: %v", paths)
}
var (
hugePageSizes []string
initHPSOnce sync.Once
)
func HugePageSizes() []string {
initHPSOnce.Do(func() {
dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return
}
files, err := dir.Readdirnames(0)
dir.Close()
if err != nil {
return
}
hugePageSizes, err = getHugePageSizeFromFilenames(files)
if err != nil {
logrus.Warn("HugePageSizes: ", err)
}
})
return hugePageSizes
}
func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
pageSizes := make([]string, 0, len(fileNames))
var warn error
for _, file := range fileNames {
// example: hugepages-1048576kB
val, ok := strings.CutPrefix(file, "hugepages-")
if !ok {
// Unexpected file name: no prefix found, ignore it.
continue
}
// The suffix is always "kB" (as of Linux 5.13). If we find
// something else, produce an error but keep going.
eLen := len(val) - 2
val = strings.TrimSuffix(val, "kB")
if len(val) != eLen {
// Highly unlikely.
if warn == nil {
warn = errors.New(file + `: invalid suffix (expected "kB")`)
}
continue
}
size, err := strconv.Atoi(val)
if err != nil {
// Highly unlikely.
if warn == nil {
warn = fmt.Errorf("%s: %w", file, err)
}
continue
}
// Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
// but in our case the size is in KB already.
if size >= (1 << 20) {
val = strconv.Itoa(size>>20) + "GB"
} else if size >= (1 << 10) {
val = strconv.Itoa(size>>10) + "MB"
} else {
val += "KB"
}
pageSizes = append(pageSizes, val)
}
return pageSizes, warn
}
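// exampleHugePageSizes is an illustrative sketch, not part of the original
// source (the function name is hypothetical); it shows the conversion done by
// getHugePageSizeFromFilenames.
func exampleHugePageSizes() {
sizes, warn := getHugePageSizeFromFilenames([]string{"hugepages-2048kB", "hugepages-1048576kB"})
// sizes == []string{"2MB", "1GB"}, warn == nil
fmt.Println(sizes, warn)
}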
// GetPids returns all pids that were added to the cgroup at the given path.
func GetPids(dir string) ([]int, error) {
return readProcsFile(dir)
}
// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file
func WriteCgroupProc(dir string, pid int) error {
// Normally dir should not be empty; one case where it is empty is when the
// cgroup subsystem is not mounted, and in that case we want to fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s", CgroupProcesses)
}
// Don't attach any pid to the cgroup if -1 is specified as a pid.
if pid == -1 {
return nil
}
file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY)
if err != nil {
return fmt.Errorf("failed to write %v: %w", pid, err)
}
defer file.Close()
for i := 0; i < 5; i++ {
_, err = file.WriteString(strconv.Itoa(pid))
if err == nil {
return nil
}
// EINVAL might mean that the task being added to cgroup.procs is in state
// TASK_NEW. We should attempt the write again.
if errors.Is(err, unix.EINVAL) {
time.Sleep(30 * time.Millisecond)
continue
}
return fmt.Errorf("failed to write %v: %w", pid, err)
}
return err
}
// Since the OCI spec is designed for cgroup v1, in some cases there is a need
// to convert the cgroup v1 configuration to its cgroup v2 equivalent.
// The formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142),
// converting linearly from the range [2-262144] to [1-10000]
// (262144 comes from the Linux kernel definition "#define MAX_SHARES (1UL << 18)").
func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 {
if cpuShares == 0 {
return 0
}
return (1 + ((cpuShares-2)*9999)/262142)
}
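// Worked example (illustrative, not part of the original source): the cgroup
// v1 default of 1024 shares maps to 1 + ((1024-2)*9999)/262142 = 1 + 38 = 39,
// and the range boundaries map as 2 -> 1 and 262144 -> 10000.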
// ConvertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec
// for use by cgroup v2 drivers. A conversion is needed since Resources.MemorySwap
// is defined as memory+swap combined, while in cgroup v2 swap is a separate value,
// so we need to subtract memory from it where it makes sense.
func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
switch {
case memory == -1 && memorySwap == 0:
// For compatibility with cgroup1 controller, set swap to unlimited in
// case the memory is set to unlimited and the swap is not explicitly set,
// treating the request as "set both memory and swap to unlimited".
return -1, nil
case memorySwap == -1, memorySwap == 0:
// Treat -1 ("max") and 0 ("unset") swap as is.
return memorySwap, nil
case memory == -1:
// Unlimited memory, so treat swap as is.
return memorySwap, nil
case memory == 0:
// Unset or unknown memory, can't calculate swap.
return 0, errors.New("unable to set swap limit without memory limit")
case memory < 0:
// Does not make sense to subtract a negative value.
return 0, fmt.Errorf("invalid memory value: %d", memory)
case memorySwap < memory:
// Sanity check.
return 0, errors.New("memory+swap limit should be >= memory limit")
}
return memorySwap - memory, nil
}
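// Worked example (illustrative, not part of the original source): with
// memory=268435456 (256 MiB) and memorySwap=536870912 (512 MiB), the result
// is 268435456, i.e. 256 MiB of swap on top of the memory limit; with
// memory=-1 and memorySwap=0 the result is -1 (both unlimited).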
// Since the OCI spec is designed for cgroup v1, in some cases there is a need
// to convert the cgroup v1 configuration to its cgroup v2 equivalent.
// The formula for converting BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990),
// converting linearly from the range [10-1000] to [1-10000].
func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
if blkIoWeight == 0 {
return 0
}
return 1 + (uint64(blkIoWeight)-10)*9999/990
}
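// Worked example (illustrative, not part of the original source): a cgroup v1
// BlkioWeight of 500 maps to 1 + (500-10)*9999/990 = 4950, and the range
// boundaries map as 10 -> 1 and 1000 -> 10000.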
package cgroups
import (
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"syscall"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/moby/sys/mountinfo"
"golang.org/x/sys/unix"
)
// Code in this source file is specific to cgroup v1,
// and must not be used from any cgroup v2 code.
const (
CgroupNamePrefix = "name="
defaultPrefix = "/sys/fs/cgroup"
)
var (
errUnified = errors.New("not implemented for cgroup v2 unified hierarchy")
ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1")
readMountinfoOnce sync.Once
readMountinfoErr error
cgroupMountinfo []*mountinfo.Info
)
type NotFoundError struct {
Subsystem string
}
func (e *NotFoundError) Error() string {
return fmt.Sprintf("mountpoint for %s not found", e.Subsystem)
}
func NewNotFoundError(sub string) error {
return &NotFoundError{
Subsystem: sub,
}
}
func IsNotFound(err error) bool {
var nfErr *NotFoundError
return errors.As(err, &nfErr)
}
func tryDefaultPath(cgroupPath, subsystem string) string {
if !strings.HasPrefix(defaultPrefix, cgroupPath) {
return ""
}
// remove possible prefix
subsystem = strings.TrimPrefix(subsystem, CgroupNamePrefix)
// Make sure we're still under defaultPrefix, and resolve
// a possible symlink (like cpu -> cpu,cpuacct).
path, err := securejoin.SecureJoin(defaultPrefix, subsystem)
if err != nil {
return ""
}
// (1) path should be a directory.
st, err := os.Lstat(path)
if err != nil || !st.IsDir() {
return ""
}
// (2) path should be a mount point.
pst, err := os.Lstat(filepath.Dir(path))
if err != nil {
return ""
}
if st.Sys().(*syscall.Stat_t).Dev == pst.Sys().(*syscall.Stat_t).Dev {
// parent dir has the same dev -- path is not a mount point
return ""
}
// (3) path should have 'cgroup' fs type.
fst := unix.Statfs_t{}
err = unix.Statfs(path, &fst)
if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC {
return ""
}
return path
}
// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones
// with fstype of "cgroup") for the current running process.
//
// The results are cached (to avoid re-reading mountinfo which is relatively
// expensive), so it is assumed that cgroup mounts are not being changed.
func readCgroupMountinfo() ([]*mountinfo.Info, error) {
readMountinfoOnce.Do(func() {
// mountinfo.GetMounts uses /proc/thread-self, so we can use it without
// issues.
cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
mountinfo.FSTypeFilter("cgroup"),
)
})
return cgroupMountinfo, readMountinfoErr
}
// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
if IsCgroup2UnifiedMode() {
return "", errUnified
}
// If subsystem is empty, we look for the cgroupv2 hybrid path.
if len(subsystem) == 0 {
return hybridMountpoint, nil
}
// Avoid parsing mountinfo by trying the default path first, if possible.
if path := tryDefaultPath(cgroupPath, subsystem); path != "" {
return path, nil
}
mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem)
return mnt, err
}
func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) {
if IsCgroup2UnifiedMode() {
return "", "", errUnified
}
mi, err := readCgroupMountinfo()
if err != nil {
return "", "", err
}
return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem)
}
func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) {
for _, mi := range mounts {
if strings.HasPrefix(mi.Mountpoint, cgroupPath) {
for _, opt := range strings.Split(mi.VFSOptions, ",") {
if opt == subsystem {
return mi.Mountpoint, mi.Root, nil
}
}
}
}
return "", "", NewNotFoundError(subsystem)
}
func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
if len(m.Subsystems) == 0 {
return "", errors.New("no subsystem for mount")
}
return getControllerPath(m.Subsystems[0], cgroups)
}
func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) {
res := make([]Mount, 0, len(ss))
numFound := 0
for _, mi := range mounts {
m := Mount{
Mountpoint: mi.Mountpoint,
Root: mi.Root,
}
for _, opt := range strings.Split(mi.VFSOptions, ",") {
seen, known := ss[opt]
if !known || (!all && seen) {
continue
}
ss[opt] = true
opt = strings.TrimPrefix(opt, CgroupNamePrefix)
m.Subsystems = append(m.Subsystems, opt)
numFound++
}
if len(m.Subsystems) > 0 || all {
res = append(res, m)
}
if !all && numFound >= len(ss) {
break
}
}
return res, nil
}
func getCgroupMountsV1(all bool) ([]Mount, error) {
mi, err := readCgroupMountinfo()
if err != nil {
return nil, err
}
// We don't need to use /proc/thread-self here because runc always runs
// with every thread in the same cgroup. This lets us avoid having to do
// runtime.LockOSThread.
allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return nil, err
}
allMap := make(map[string]bool)
for s := range allSubsystems {
allMap[s] = false
}
return getCgroupMountsHelper(allMap, mi, all)
}
// GetOwnCgroup returns the relative path to the cgroup the calling process is running in.
func GetOwnCgroup(subsystem string) (string, error) {
if IsCgroup2UnifiedMode() {
return "", errUnified
}
// We don't need to use /proc/thread-self here because runc always runs
// with every thread in the same cgroup. This lets us avoid having to do
// runtime.LockOSThread.
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return "", err
}
return getControllerPath(subsystem, cgroups)
}
func GetOwnCgroupPath(subsystem string) (string, error) {
cgroup, err := GetOwnCgroup(subsystem)
if err != nil {
return "", err
}
// If subsystem is empty, we look for the cgroupv2 hybrid path.
if len(subsystem) == 0 {
return hybridMountpoint, nil
}
return getCgroupPathHelper(subsystem, cgroup)
}
func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
mnt, root, err := FindCgroupMountpointAndRoot("", subsystem)
if err != nil {
return "", err
}
// This is needed for nested containers, because in /proc/self/cgroup we
// see paths from host, which don't exist in container.
relCgroup, err := filepath.Rel(root, cgroup)
if err != nil {
return "", err
}
return filepath.Join(mnt, relCgroup), nil
}
func getControllerPath(subsystem string, cgroups map[string]string) (string, error) {
if IsCgroup2UnifiedMode() {
return "", errUnified
}
if p, ok := cgroups[subsystem]; ok {
return p, nil
}
if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok {
return p, nil
}
return "", NewNotFoundError(subsystem)
}
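// Illustrative sketch (not part of the original source): shows how the cgroup v1
// lookup helpers above are typically composed. The "memory" subsystem and the
// function name are example choices only; on a cgroup v2 (unified) host both
// calls fail with errUnified.
func exampleMemoryCgroupPath() (string, error) {
	// Mountpoint of the memory controller, e.g. /sys/fs/cgroup/memory.
	mnt, err := FindCgroupMountpoint("", "memory")
	if err != nil {
		return "", err
	}
	// Absolute path to the memory cgroup of the current process,
	// e.g. /sys/fs/cgroup/memory/user.slice.
	own, err := GetOwnCgroupPath("memory")
	if err != nil {
		return "", err
	}
	_ = mnt
	return own, nil
}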
package configs
import (
"bytes"
"encoding/json"
"fmt"
"os/exec"
"time"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
devices "github.com/opencontainers/runc/libcontainer/cgroups/devices/config"
"github.com/opencontainers/runtime-spec/specs-go"
)
type Rlimit struct {
Type int `json:"type"`
Hard uint64 `json:"hard"`
Soft uint64 `json:"soft"`
}
// IDMap represents UID/GID Mappings for User Namespaces.
type IDMap struct {
ContainerID int64 `json:"container_id"`
HostID int64 `json:"host_id"`
Size int64 `json:"size"`
}
// Seccomp represents syscall restrictions
// By default, only the native architecture of the kernel is allowed to be used
// for syscalls. Additional architectures can be added by specifying them in
// Architectures.
type Seccomp struct {
DefaultAction Action `json:"default_action"`
Architectures []string `json:"architectures"`
Flags []specs.LinuxSeccompFlag `json:"flags"`
Syscalls []*Syscall `json:"syscalls"`
DefaultErrnoRet *uint `json:"default_errno_ret"`
ListenerPath string `json:"listener_path,omitempty"`
ListenerMetadata string `json:"listener_metadata,omitempty"`
}
// Action is taken upon rule match in Seccomp
type Action int
const (
Kill Action = iota + 1
Errno
Trap
Allow
Trace
Log
Notify
KillThread
KillProcess
)
// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
type Operator int
const (
EqualTo Operator = iota + 1
NotEqualTo
GreaterThan
GreaterThanOrEqualTo
LessThan
LessThanOrEqualTo
MaskEqualTo
)
// Arg is a rule to match a specific syscall argument in Seccomp
type Arg struct {
Index uint `json:"index"`
Value uint64 `json:"value"`
ValueTwo uint64 `json:"value_two"`
Op Operator `json:"op"`
}
// Syscall is a rule to match a syscall in Seccomp
type Syscall struct {
Name string `json:"name"`
Action Action `json:"action"`
ErrnoRet *uint `json:"errnoRet"`
Args []*Arg `json:"args"`
}
// Config defines configuration options for executing a process inside a contained environment.
type Config struct {
// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
// This is a common option when the container is running in ramdisk
NoPivotRoot bool `json:"no_pivot_root"`
// ParentDeathSignal specifies the signal that is sent to the container's process in the case
// that the parent process dies.
ParentDeathSignal int `json:"parent_death_signal"`
// Path to a directory containing the container's root filesystem.
Rootfs string `json:"rootfs"`
// Umask is the umask to use inside of the container.
Umask *uint32 `json:"umask"`
// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
// bind mounts are writable.
Readonlyfs bool `json:"readonlyfs"`
// Specifies the mount propagation flags to be applied to /.
RootPropagation int `json:"rootPropagation"`
// Mounts specify additional source and destination paths that will be mounted inside the container's
// rootfs and mount namespace if specified
Mounts []*Mount `json:"mounts"`
// The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well!
Devices []*devices.Device `json:"devices"`
MountLabel string `json:"mount_label"`
// Hostname optionally sets the container's hostname if provided
Hostname string `json:"hostname"`
// Domainname optionally sets the container's domainname if provided
Domainname string `json:"domainname"`
// Namespaces specifies the container's namespaces that it should setup when cloning the init process
// If a namespace is not provided that namespace is shared from the container's parent process
Namespaces Namespaces `json:"namespaces"`
// Capabilities specify the capabilities to keep when executing the process inside the container
// All capabilities not specified will be dropped from the processes capability mask
Capabilities *Capabilities `json:"capabilities"`
// Networks specifies the container's network setup to be created
Networks []*Network `json:"networks"`
// Routes can be specified to create entries in the route table as the container is started
Routes []*Route `json:"routes"`
// Cgroups specifies specific cgroup settings for the various subsystems that the container is
// placed into to limit the resources the container has available
Cgroups *Cgroup `json:"cgroups"`
// AppArmorProfile specifies the profile to apply to the process running in the container and is
// changed at the time the process is execed
AppArmorProfile string `json:"apparmor_profile,omitempty"`
// ProcessLabel specifies the label to apply to the process running in the container. It is
// commonly used by selinux
ProcessLabel string `json:"process_label,omitempty"`
// Rlimits specifies the resource limits, such as max open files, to set in the container
// If Rlimits are not set, the container will inherit rlimits from the parent process
Rlimits []Rlimit `json:"rlimits,omitempty"`
// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
// for a process. Valid values are in the range [-1000, 1000], where processes with
// higher scores are preferred for being killed. If it is unset then we don't touch the current
// value.
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
OomScoreAdj *int `json:"oom_score_adj,omitempty"`
// UIDMappings is an array of User ID mappings for User Namespaces
UIDMappings []IDMap `json:"uid_mappings"`
// GIDMappings is an array of Group ID mappings for User Namespaces
GIDMappings []IDMap `json:"gid_mappings"`
// MaskPaths specifies paths within the container's rootfs to mask over with a bind
// mount pointing to /dev/null as to prevent reads of the file.
MaskPaths []string `json:"mask_paths"`
// ReadonlyPaths specifies paths within the container's rootfs to remount as read-only
// so that these files prevent any writes.
ReadonlyPaths []string `json:"readonly_paths"`
// Sysctl is a map of properties and their values. It is the equivalent of using
// sysctl -w my.property.name value in Linux.
Sysctl map[string]string `json:"sysctl"`
// Seccomp allows actions to be taken whenever a syscall is made within the container.
// A number of rules are given, each having an action to be taken if a syscall matches it.
// A default action to be taken if no rules match is also given.
Seccomp *Seccomp `json:"seccomp"`
// NoNewPrivileges controls whether processes in the container can gain additional privileges.
NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
// Hooks are a collection of actions to perform at various container lifecycle events.
// CommandHooks are serialized to JSON, but other hooks are not.
Hooks Hooks
// Version is the version of opencontainer specification that is supported.
Version string `json:"version"`
// Labels are user defined metadata that is stored in the config and populated on the state
Labels []string `json:"labels"`
// NoNewKeyring will not allocate a new session keyring for the container. It will use the
// caller's keyring in this case.
NoNewKeyring bool `json:"no_new_keyring"`
// IntelRdt specifies settings for Intel RDT group that the container is placed into
// to limit the resources (e.g., L3 cache, memory bandwidth) the container has available
IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`
// RootlessEUID is set when the runc was launched with non-zero EUID.
// Note that RootlessEUID is set to false when launched with EUID=0 in userns.
// When RootlessEUID is set, runc creates a new userns for the container.
// (config.json needs to contain userns settings)
RootlessEUID bool `json:"rootless_euid,omitempty"`
// RootlessCgroups is set when the process is unlikely to have full access to cgroups.
// When RootlessCgroups is set, cgroups errors are ignored.
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
// TimeOffsets specifies the offset for supporting time namespaces.
TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"`
// Scheduler represents the scheduling attributes for a process.
Scheduler *Scheduler `json:"scheduler,omitempty"`
// Personality contains configuration for the Linux personality syscall.
Personality *LinuxPersonality `json:"personality,omitempty"`
// IOPriority is the container's I/O priority.
IOPriority *IOPriority `json:"io_priority,omitempty"`
}
// Scheduler is based on the Linux sched_setattr(2) syscall.
type Scheduler = specs.Scheduler
// ToSchedAttr is to convert *configs.Scheduler to *unix.SchedAttr.
func ToSchedAttr(scheduler *Scheduler) (*unix.SchedAttr, error) {
var policy uint32
switch scheduler.Policy {
case specs.SchedOther:
policy = 0
case specs.SchedFIFO:
policy = 1
case specs.SchedRR:
policy = 2
case specs.SchedBatch:
policy = 3
case specs.SchedISO:
policy = 4
case specs.SchedIdle:
policy = 5
case specs.SchedDeadline:
policy = 6
default:
return nil, fmt.Errorf("invalid scheduler policy: %s", scheduler.Policy)
}
var flags uint64
for _, flag := range scheduler.Flags {
switch flag {
case specs.SchedFlagResetOnFork:
flags |= 0x01
case specs.SchedFlagReclaim:
flags |= 0x02
case specs.SchedFlagDLOverrun:
flags |= 0x04
case specs.SchedFlagKeepPolicy:
flags |= 0x08
case specs.SchedFlagKeepParams:
flags |= 0x10
case specs.SchedFlagUtilClampMin:
flags |= 0x20
case specs.SchedFlagUtilClampMax:
flags |= 0x40
default:
return nil, fmt.Errorf("invalid scheduler flag: %s", flag)
}
}
return &unix.SchedAttr{
Size: unix.SizeofSchedAttr,
Policy: policy,
Flags: flags,
Nice: scheduler.Nice,
Priority: uint32(scheduler.Priority),
Runtime: scheduler.Runtime,
Deadline: scheduler.Deadline,
Period: scheduler.Period,
}, nil
}
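// Illustrative sketch (not part of the original source): converts an example
// SCHED_FIFO configuration and applies it to the calling thread, assuming the
// SchedSetAttr wrapper from golang.org/x/sys/unix is available. The priority
// value 10 and the function name are placeholders.
func exampleApplyScheduler() error {
	attr, err := ToSchedAttr(&Scheduler{
		Policy:   specs.SchedFIFO,
		Priority: 10,
	})
	if err != nil {
		return err
	}
	// pid 0 means "the calling thread"; the flags argument is currently unused.
	return unix.SchedSetAttr(0, attr, 0)
}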
type IOPriority = specs.LinuxIOPriority
type (
HookName string
HookList []Hook
Hooks map[HookName]HookList
)
const (
// Prestart commands are executed after the container namespaces are created,
// but before the user supplied command is executed from init.
// Note: This hook is now deprecated
// Prestart commands are called in the Runtime namespace.
Prestart HookName = "prestart"
// CreateRuntime commands MUST be called as part of the create operation after
// the runtime environment has been created but before the pivot_root has been executed.
// CreateRuntime is called immediately after the deprecated Prestart hook.
// CreateRuntime commands are called in the Runtime Namespace.
CreateRuntime HookName = "createRuntime"
// CreateContainer commands MUST be called as part of the create operation after
// the runtime environment has been created but before the pivot_root has been executed.
// CreateContainer commands are called in the Container namespace.
CreateContainer HookName = "createContainer"
// StartContainer commands MUST be called as part of the start operation and before
// the container process is started.
// StartContainer commands are called in the Container namespace.
StartContainer HookName = "startContainer"
// Poststart commands are executed after the container init process starts.
// Poststart commands are called in the Runtime Namespace.
Poststart HookName = "poststart"
// Poststop commands are executed after the container init process exits.
// Poststop commands are called in the Runtime Namespace.
Poststop HookName = "poststop"
)
// HasHook checks if config has any hooks with any given names configured.
func (c *Config) HasHook(names ...HookName) bool {
if c.Hooks == nil {
return false
}
for _, h := range names {
if len(c.Hooks[h]) > 0 {
return true
}
}
return false
}
// KnownHookNames returns the known hook names.
// Used by `runc features`.
func KnownHookNames() []string {
return []string{
string(Prestart), // deprecated
string(CreateRuntime),
string(CreateContainer),
string(StartContainer),
string(Poststart),
string(Poststop),
}
}
type Capabilities struct {
// Bounding is the set of capabilities checked by the kernel.
Bounding []string
// Effective is the set of capabilities checked by the kernel.
Effective []string
// Inheritable is the capabilities preserved across execve.
Inheritable []string
// Permitted is the limiting superset for effective capabilities.
Permitted []string
// Ambient is the ambient set of capabilities that are kept.
Ambient []string
}
// Deprecated: use (Hooks).Run instead.
func (hooks HookList) RunHooks(state *specs.State) error {
for i, h := range hooks {
if err := h.Run(state); err != nil {
return fmt.Errorf("error running hook #%d: %w", i, err)
}
}
return nil
}
func (hooks *Hooks) UnmarshalJSON(b []byte) error {
var state map[HookName][]CommandHook
if err := json.Unmarshal(b, &state); err != nil {
return err
}
*hooks = Hooks{}
for n, commandHooks := range state {
if len(commandHooks) == 0 {
continue
}
(*hooks)[n] = HookList{}
for _, h := range commandHooks {
(*hooks)[n] = append((*hooks)[n], h)
}
}
return nil
}
func (hooks *Hooks) MarshalJSON() ([]byte, error) {
serialize := func(hooks []Hook) (serializableHooks []CommandHook) {
for _, hook := range hooks {
switch chook := hook.(type) {
case CommandHook:
serializableHooks = append(serializableHooks, chook)
default:
logrus.Warnf("cannot serialize hook of type %T, skipping", hook)
}
}
return serializableHooks
}
return json.Marshal(map[string]interface{}{
"prestart": serialize((*hooks)[Prestart]),
"createRuntime": serialize((*hooks)[CreateRuntime]),
"createContainer": serialize((*hooks)[CreateContainer]),
"startContainer": serialize((*hooks)[StartContainer]),
"poststart": serialize((*hooks)[Poststart]),
"poststop": serialize((*hooks)[Poststop]),
})
}
// Run executes all hooks for the given hook name.
func (hooks Hooks) Run(name HookName, state *specs.State) error {
list := hooks[name]
for i, h := range list {
if err := h.Run(state); err != nil {
return fmt.Errorf("error running %s hook #%d: %w", name, i, err)
}
}
return nil
}
// SetDefaultEnv sets the environment for those CommandHook entries
// that do not have one set.
func (hooks HookList) SetDefaultEnv(env []string) {
for _, h := range hooks {
if ch, ok := h.(CommandHook); ok && len(ch.Env) == 0 {
ch.Env = env
}
}
}
type Hook interface {
// Run executes the hook with the provided state.
Run(*specs.State) error
}
// NewFunctionHook will call the provided function when the hook is run.
func NewFunctionHook(f func(*specs.State) error) FuncHook {
return FuncHook{
run: f,
}
}
type FuncHook struct {
run func(*specs.State) error
}
func (f FuncHook) Run(s *specs.State) error {
return f.run(s)
}
type Command struct {
Path string `json:"path"`
Args []string `json:"args"`
Env []string `json:"env"`
Dir string `json:"dir"`
Timeout *time.Duration `json:"timeout"`
}
// NewCommandHook will execute the provided command when the hook is run.
func NewCommandHook(cmd *Command) CommandHook {
return CommandHook{
Command: cmd,
}
}
type CommandHook struct {
*Command
}
func (c *Command) Run(s *specs.State) error {
b, err := json.Marshal(s)
if err != nil {
return err
}
var stdout, stderr bytes.Buffer
cmd := exec.Cmd{
Path: c.Path,
Args: c.Args,
Env: c.Env,
Stdin: bytes.NewReader(b),
Stdout: &stdout,
Stderr: &stderr,
}
if err := cmd.Start(); err != nil {
return err
}
errC := make(chan error, 1)
go func() {
err := cmd.Wait()
if err != nil {
err = fmt.Errorf("%w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
}
errC <- err
}()
var timerCh <-chan time.Time
if c.Timeout != nil {
timer := time.NewTimer(*c.Timeout)
defer timer.Stop()
timerCh = timer.C
}
select {
case err := <-errC:
return err
case <-timerCh:
_ = cmd.Process.Kill()
<-errC
return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
}
}
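// Illustrative sketch (not part of the original source): builds a Hooks map
// with one command hook and one in-process function hook, then runs the
// poststart list. The /usr/bin/true path, the 2s timeout, and the function
// name are placeholders.
func exampleRunPoststartHooks(state *specs.State) error {
	timeout := 2 * time.Second
	hooks := Hooks{
		Poststart: HookList{
			NewCommandHook(&Command{
				Path:    "/usr/bin/true",
				Args:    []string{"true"},
				Timeout: &timeout,
			}),
			NewFunctionHook(func(s *specs.State) error {
				logrus.Debugf("poststart hook ran for container %s", s.ID)
				return nil
			}),
		},
	}
	return hooks.Run(Poststart, state)
}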
package configs
import (
"errors"
"fmt"
"math"
)
var (
errNoUIDMap = errors.New("user namespaces enabled, but no uid mappings found")
errNoGIDMap = errors.New("user namespaces enabled, but no gid mappings found")
)
// Please check https://man7.org/linux/man-pages/man2/personality.2.html for const details.
// https://raw.githubusercontent.com/torvalds/linux/master/include/uapi/linux/personality.h
const (
PerLinux = 0x0000
PerLinux32 = 0x0008
)
type LinuxPersonality struct {
// Domain for the personality
// can only contain values "LINUX" and "LINUX32"
Domain int `json:"domain"`
}
// HostUID gets the translated uid for the process on host which could be
// different when user namespaces are enabled.
func (c Config) HostUID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if len(c.UIDMappings) == 0 {
return -1, errNoUIDMap
}
id, found := c.hostIDFromMapping(int64(containerId), c.UIDMappings)
if !found {
return -1, fmt.Errorf("user namespaces enabled, but no mapping found for uid %d", containerId)
}
// If we are a 32-bit binary running on a 64-bit system, it's possible
// the mapped user is too large to store in an int, which means we
// cannot do the mapping. We can't just return an int64, because
// os.Setuid() takes an int.
if id > math.MaxInt {
return -1, fmt.Errorf("mapping for uid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt)
}
return int(id), nil
}
// Return unchanged id.
return containerId, nil
}
// HostRootUID gets the root uid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostRootUID() (int, error) {
return c.HostUID(0)
}
// HostGID gets the translated gid for the process on host which could be
// different when user namespaces are enabled.
func (c Config) HostGID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if len(c.GIDMappings) == 0 {
return -1, errNoGIDMap
}
id, found := c.hostIDFromMapping(int64(containerId), c.GIDMappings)
if !found {
return -1, fmt.Errorf("user namespaces enabled, but no mapping found for gid %d", containerId)
}
// If we are a 32-bit binary running on a 64-bit system, it's possible
// the mapped user is too large to store in an int, which means we
// cannot do the mapping. We can't just return an int64, because
// os.Setgid() takes an int.
if id > math.MaxInt {
return -1, fmt.Errorf("mapping for gid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt)
}
return int(id), nil
}
// Return unchanged id.
return containerId, nil
}
// HostRootGID gets the root gid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostRootGID() (int, error) {
return c.HostGID(0)
}
// Utility function that gets a host ID for a container ID from user namespace map
// if that ID is present in the map.
func (c Config) hostIDFromMapping(containerID int64, uMap []IDMap) (int64, bool) {
for _, m := range uMap {
if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) {
hostID := m.HostID + (containerID - m.ContainerID)
return hostID, true
}
}
return -1, false
}
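// Illustrative sketch (not part of the original source): with the single
// mapping below, container uid 0 maps to host uid 100000, so container uid 7
// maps to host uid 100007. The mapping values and function name are example
// placeholders only.
func exampleHostUID() (int, error) {
	c := Config{
		Namespaces:  Namespaces{{Type: NEWUSER}},
		UIDMappings: []IDMap{{ContainerID: 0, HostID: 100000, Size: 65536}},
	}
	return c.HostUID(7) // 100000 + (7 - 0) = 100007
}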
//go:build gofuzz
package configs
func FuzzUnmarshalJSON(data []byte) int {
hooks := Hooks{}
_ = hooks.UnmarshalJSON(data)
return 1
}
package configs
import "golang.org/x/sys/unix"
type MountIDMapping struct {
// Recursive indicates if the mapping needs to be recursive.
Recursive bool `json:"recursive"`
// UserNSPath is a path to a user namespace that indicates the necessary
// id-mappings for MOUNT_ATTR_IDMAP. If set to non-"", UIDMappings and
// GIDMappings must be set to nil.
UserNSPath string `json:"userns_path,omitempty"`
// UIDMappings is the uid mapping set for this mount, to be used with
// MOUNT_ATTR_IDMAP.
UIDMappings []IDMap `json:"uid_mappings,omitempty"`
// GIDMappings is the gid mapping set for this mount, to be used with
// MOUNT_ATTR_IDMAP.
GIDMappings []IDMap `json:"gid_mappings,omitempty"`
}
type Mount struct {
// Source path for the mount.
Source string `json:"source"`
// Destination path for the mount inside the container.
Destination string `json:"destination"`
// Device the mount is for.
Device string `json:"device"`
// Mount flags.
Flags int `json:"flags"`
// Mount flags that were explicitly cleared in the configuration (meaning
// the user explicitly requested that these flags *not* be set).
ClearedFlags int `json:"cleared_flags"`
// Propagation Flags
PropagationFlags []int `json:"propagation_flags"`
// Mount data applied to the mount.
Data string `json:"data"`
// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
Relabel string `json:"relabel"`
// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
RecAttr *unix.MountAttr `json:"rec_attr"`
// Extensions are additional flags that are specific to runc.
Extensions int `json:"extensions"`
// Mapping is the MOUNT_ATTR_IDMAP configuration for the mount. If non-nil,
// the mount is configured to use MOUNT_ATTR_IDMAP-style id mappings.
IDMapping *MountIDMapping `json:"id_mapping,omitempty"`
}
func (m *Mount) IsBind() bool {
return m.Flags&unix.MS_BIND != 0
}
func (m *Mount) IsIDMapped() bool {
return m.IDMapping != nil
}
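// Illustrative sketch (not part of the original source): a read-only bind
// mount of the host's /etc/resolv.conf; the paths and function name are
// placeholders. Note that bind mounts must not carry filesystem-specific
// Data (the validator rejects that combination).
func exampleBindMount() *Mount {
	m := &Mount{
		Source:      "/etc/resolv.conf",
		Destination: "/etc/resolv.conf",
		Device:      "bind",
		Flags:       unix.MS_BIND | unix.MS_RDONLY,
	}
	_ = m.IsBind()     // true
	_ = m.IsIDMapped() // false, no IDMapping set
	return m
}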
package configs
import (
"fmt"
"os"
"sync"
)
const (
NEWNET NamespaceType = "NEWNET"
NEWPID NamespaceType = "NEWPID"
NEWNS NamespaceType = "NEWNS"
NEWUTS NamespaceType = "NEWUTS"
NEWIPC NamespaceType = "NEWIPC"
NEWUSER NamespaceType = "NEWUSER"
NEWCGROUP NamespaceType = "NEWCGROUP"
NEWTIME NamespaceType = "NEWTIME"
)
var (
nsLock sync.Mutex
supportedNamespaces = make(map[NamespaceType]bool)
)
// NsName converts the namespace type to its filename
func NsName(ns NamespaceType) string {
switch ns {
case NEWNET:
return "net"
case NEWNS:
return "mnt"
case NEWPID:
return "pid"
case NEWIPC:
return "ipc"
case NEWUSER:
return "user"
case NEWUTS:
return "uts"
case NEWCGROUP:
return "cgroup"
case NEWTIME:
return "time"
}
return ""
}
// IsNamespaceSupported returns whether a namespace is available.
func IsNamespaceSupported(ns NamespaceType) bool {
nsLock.Lock()
defer nsLock.Unlock()
supported, ok := supportedNamespaces[ns]
if ok {
return supported
}
nsFile := NsName(ns)
// if the namespace type is unknown, just return false
if nsFile == "" {
return false
}
// We don't need to use /proc/thread-self here because the list of
// namespace types is unrelated to the thread. This lets us avoid having to
// do runtime.LockOSThread.
_, err := os.Stat("/proc/self/ns/" + nsFile)
// a namespace is supported if it exists and we have permissions to read it
supported = err == nil
supportedNamespaces[ns] = supported
return supported
}
func NamespaceTypes() []NamespaceType {
return []NamespaceType{
NEWUSER, // Keep user NS always first, don't move it.
NEWIPC,
NEWUTS,
NEWNET,
NEWPID,
NEWNS,
NEWCGROUP,
NEWTIME,
}
}
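// Illustrative sketch (not part of the original source): reports which
// namespace types the running kernel exposes under /proc/self/ns. The
// function name is a placeholder.
func exampleSupportedNamespaces() map[NamespaceType]bool {
	res := make(map[NamespaceType]bool, len(NamespaceTypes()))
	for _, t := range NamespaceTypes() {
		res[t] = IsNamespaceSupported(t)
	}
	return res
}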
// Namespace defines configuration for each namespace. It specifies an
// alternate path that can be joined via setns.
type Namespace struct {
Type NamespaceType `json:"type"`
Path string `json:"path"`
}
func (n *Namespace) GetPath(pid int) string {
return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
}
func (n *Namespaces) Remove(t NamespaceType) bool {
i := n.index(t)
if i == -1 {
return false
}
*n = append((*n)[:i], (*n)[i+1:]...)
return true
}
func (n *Namespaces) Add(t NamespaceType, path string) {
i := n.index(t)
if i == -1 {
*n = append(*n, Namespace{Type: t, Path: path})
return
}
(*n)[i].Path = path
}
func (n *Namespaces) index(t NamespaceType) int {
for i, ns := range *n {
if ns.Type == t {
return i
}
}
return -1
}
func (n *Namespaces) Contains(t NamespaceType) bool {
return n.index(t) != -1
}
func (n *Namespaces) PathOf(t NamespaceType) string {
i := n.index(t)
if i == -1 {
return ""
}
return (*n)[i].Path
}
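// Illustrative sketch (not part of the original source): first requests a
// private network namespace, then switches it to join an existing one. The
// /run/netns/example path and the function name are placeholders.
func exampleNamespacesEdit() Namespaces {
	var ns Namespaces
	ns.Add(NEWNET, "")                   // private network namespace
	ns.Add(NEWNET, "/run/netns/example") // join an existing netns instead
	_ = ns.Contains(NEWNET)              // true
	_ = ns.PathOf(NEWNET)                // "/run/netns/example"
	return ns
}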
//go:build linux
package configs
import "golang.org/x/sys/unix"
func (n *Namespace) Syscall() int {
return namespaceInfo[n.Type]
}
var namespaceInfo = map[NamespaceType]int{
NEWNET: unix.CLONE_NEWNET,
NEWNS: unix.CLONE_NEWNS,
NEWUSER: unix.CLONE_NEWUSER,
NEWIPC: unix.CLONE_NEWIPC,
NEWUTS: unix.CLONE_NEWUTS,
NEWPID: unix.CLONE_NEWPID,
NEWCGROUP: unix.CLONE_NEWCGROUP,
NEWTIME: unix.CLONE_NEWTIME,
}
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr {
var flag int
for _, v := range *n {
if v.Path != "" {
continue
}
flag |= namespaceInfo[v.Type]
}
return uintptr(flag)
}
// IsPrivate tells whether the namespace of type t is configured as private
// (i.e. it exists and is not shared).
func (n Namespaces) IsPrivate(t NamespaceType) bool {
for _, v := range n {
if v.Type == t {
return v.Path == ""
}
}
// Not found, so implicitly sharing a parent namespace.
return false
}
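// Illustrative sketch (not part of the original source): only namespaces
// without a Path contribute to the clone flags, which mirrors IsPrivate.
// The /run/netns/example path and function name are placeholders.
func exampleCloneFlags() uintptr {
	ns := Namespaces{
		{Type: NEWPID},                             // private: contributes CLONE_NEWPID
		{Type: NEWNET, Path: "/run/netns/example"}, // joined: contributes nothing
	}
	_ = ns.IsPrivate(NEWPID) // true
	_ = ns.IsPrivate(NEWNET) // false
	return ns.CloneFlags()   // unix.CLONE_NEWPID only
}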
package validate
import (
"errors"
"fmt"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
)
// rootlessEUIDCheck makes sure that the config can be applied when runc
// is being executed as a non-root user (euid != 0) in the current user namespace.
func rootlessEUIDCheck(config *configs.Config) error {
if !config.RootlessEUID {
return nil
}
if err := rootlessEUIDMappings(config); err != nil {
return err
}
if err := rootlessEUIDMount(config); err != nil {
return err
}
// XXX: We currently can't verify the user config at all, because
// configs.Config doesn't store the user-related configs. So this
// has to be verified by setupUser() in init_linux.go.
return nil
}
func rootlessEUIDMappings(config *configs.Config) error {
if !config.Namespaces.Contains(configs.NEWUSER) {
return errors.New("rootless container requires user namespaces")
}
// We only require mappings if we are not joining another userns.
if config.Namespaces.IsPrivate(configs.NEWUSER) {
if len(config.UIDMappings) == 0 {
return errors.New("rootless containers requires at least one UID mapping")
}
if len(config.GIDMappings) == 0 {
return errors.New("rootless containers requires at least one GID mapping")
}
}
return nil
}
// rootlessEUIDMount verifies that all mounts have valid uid=/gid= options,
// i.e. their arguments have proper ID mappings.
func rootlessEUIDMount(config *configs.Config) error {
// XXX: We could whitelist allowed devices at this point, but I'm not
// convinced that's a good idea. The kernel is the best arbiter of
// access control.
// Check that the options list doesn't contain any uid= or gid= entries
// that don't resolve to root.
for _, mount := range config.Mounts {
// Look for a common substring; skip further processing
// if there can't be any uid= or gid= options.
if !strings.Contains(mount.Data, "id=") {
continue
}
for _, opt := range strings.Split(mount.Data, ",") {
if str, ok := strings.CutPrefix(opt, "uid="); ok {
uid, err := strconv.Atoi(str)
if err != nil {
// Ignore unknown mount options.
continue
}
if _, err := config.HostUID(uid); err != nil {
return fmt.Errorf("cannot specify %s mount option for rootless container: %w", opt, err)
}
} else if str, ok := strings.CutPrefix(opt, "gid="); ok {
gid, err := strconv.Atoi(str)
if err != nil {
// Ignore unknown mount options.
continue
}
if _, err := config.HostGID(gid); err != nil {
return fmt.Errorf("cannot specify %s mount option for rootless container: %w", opt, err)
}
}
}
}
return nil
}
package validate
import (
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runtime-spec/specs-go"
selinux "github.com/opencontainers/selinux/go-selinux"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
type check func(config *configs.Config) error
func Validate(config *configs.Config) error {
checks := []check{
cgroupsCheck,
rootfs,
network,
uts,
security,
namespaces,
sysctl,
intelrdtCheck,
rootlessEUIDCheck,
mountsStrict,
scheduler,
ioPriority,
}
for _, c := range checks {
if err := c(config); err != nil {
return err
}
}
// Relaxed validation rules for backward compatibility
warns := []check{
mountsWarn,
}
for _, c := range warns {
if err := c(config); err != nil {
logrus.WithError(err).Warn("configuration")
}
}
return nil
}
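// Illustrative sketch (not part of the original source): Validate is the entry
// point libcontainer uses before creating a container. The /tmp/rootfs path,
// the hostname, and the function name are placeholders; the rootfs directory
// must exist (and not be a symlink) for the rootfs check to pass.
func exampleValidate() error {
	cfg := &configs.Config{
		Rootfs:   "/tmp/rootfs",
		Hostname: "example",
		Namespaces: configs.Namespaces{
			{Type: configs.NEWNS},
			{Type: configs.NEWUTS},
		},
	}
	return Validate(cfg)
}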
// rootfs validates that the rootfs is an absolute path and is not a symlink.
func rootfs(config *configs.Config) error {
if _, err := os.Stat(config.Rootfs); err != nil {
return fmt.Errorf("invalid rootfs: %w", err)
}
cleaned, err := filepath.Abs(config.Rootfs)
if err != nil {
return fmt.Errorf("invalid rootfs: %w", err)
}
if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil {
return fmt.Errorf("invalid rootfs: %w", err)
}
if filepath.Clean(config.Rootfs) != cleaned {
return errors.New("invalid rootfs: not an absolute path, or a symlink")
}
return nil
}
func network(config *configs.Config) error {
if !config.Namespaces.Contains(configs.NEWNET) {
if len(config.Networks) > 0 || len(config.Routes) > 0 {
return errors.New("unable to apply network settings without a private NET namespace")
}
}
return nil
}
func uts(config *configs.Config) error {
if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) {
return errors.New("unable to set hostname without a private UTS namespace")
}
if config.Domainname != "" && !config.Namespaces.Contains(configs.NEWUTS) {
return errors.New("unable to set domainname without a private UTS namespace")
}
return nil
}
func security(config *configs.Config) error {
// restrict sys without mount namespace
if (len(config.MaskPaths) > 0 || len(config.ReadonlyPaths) > 0) &&
!config.Namespaces.Contains(configs.NEWNS) {
return errors.New("unable to restrict sys entries without a private MNT namespace")
}
if config.ProcessLabel != "" && !selinux.GetEnabled() {
return errors.New("selinux label is specified in config, but selinux is disabled or not supported")
}
return nil
}
func namespaces(config *configs.Config) error {
if config.Namespaces.Contains(configs.NEWUSER) {
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
return errors.New("user namespaces aren't enabled in the kernel")
}
hasPath := config.Namespaces.PathOf(configs.NEWUSER) != ""
hasMappings := config.UIDMappings != nil || config.GIDMappings != nil
if !hasPath && !hasMappings {
return errors.New("user namespaces enabled, but no namespace path to join nor mappings to apply specified")
}
// The hasPath && hasMappings validation case is handled in specconv --
// we cache the mappings in Config during specconv in the hasPath case,
// so we cannot do that validation here.
} else {
if config.UIDMappings != nil || config.GIDMappings != nil {
return errors.New("user namespace mappings specified, but user namespace isn't enabled in the config")
}
}
if config.Namespaces.Contains(configs.NEWCGROUP) {
if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) {
return errors.New("cgroup namespaces aren't enabled in the kernel")
}
}
if config.Namespaces.Contains(configs.NEWTIME) {
if _, err := os.Stat("/proc/self/timens_offsets"); os.IsNotExist(err) {
return errors.New("time namespaces aren't enabled in the kernel")
}
hasPath := config.Namespaces.PathOf(configs.NEWTIME) != ""
hasOffsets := config.TimeOffsets != nil
if hasPath && hasOffsets {
return errors.New("time namespace enabled, but both namespace path and time offsets specified -- you may only provide one")
}
} else {
if config.TimeOffsets != nil {
return errors.New("time namespace offsets specified, but time namespace isn't enabled in the config")
}
}
return nil
}
// convertSysctlVariableToDotsSeparator returns the sysctl variable in the dot-separator format.
// The '/' separator is also accepted in place of a '.'; such variables are
// converted to the dot-separator format for validation.
// More info: sysctl(8), sysctl.d(5).
//
// For example:
// Input sysctl variable "net/ipv4/conf/eno2.100.rp_filter"
// will return the converted value "net.ipv4.conf.eno2/100.rp_filter"
func convertSysctlVariableToDotsSeparator(val string) string {
if val == "" {
return val
}
firstSepIndex := strings.IndexAny(val, "./")
if firstSepIndex == -1 || val[firstSepIndex] == '.' {
return val
}
f := func(r rune) rune {
switch r {
case '.':
return '/'
case '/':
return '.'
}
return r
}
return strings.Map(f, val)
}
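// Illustrative sketch (not part of the original source): the conversion only
// kicks in when the first separator is a '/', so dot-separated names pass
// through unchanged. The function name is a placeholder.
func exampleSysctlSeparators() []string {
	return []string{
		convertSysctlVariableToDotsSeparator("net/ipv4/conf/eno2.100.rp_filter"), // "net.ipv4.conf.eno2/100.rp_filter"
		convertSysctlVariableToDotsSeparator("net.ipv4.ip_forward"),              // unchanged
	}
}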
// sysctl validates that the specified sysctl keys are valid.
// /proc/sys isn't completely namespaced, so depending on which namespaces
// are specified, only a subset of sysctls is permitted.
func sysctl(config *configs.Config) error {
validSysctlMap := map[string]bool{
"kernel.msgmax": true,
"kernel.msgmnb": true,
"kernel.msgmni": true,
"kernel.sem": true,
"kernel.shmall": true,
"kernel.shmmax": true,
"kernel.shmmni": true,
"kernel.shm_rmid_forced": true,
}
var (
netOnce sync.Once
hostnet bool
hostnetErr error
)
for s := range config.Sysctl {
s := convertSysctlVariableToDotsSeparator(s)
if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") {
if config.Namespaces.Contains(configs.NEWIPC) {
continue
} else {
return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", s)
}
}
if strings.HasPrefix(s, "net.") {
// Is container using host netns?
// Here "host" means "current", not "initial".
netOnce.Do(func() {
if !config.Namespaces.Contains(configs.NEWNET) {
hostnet = true
return
}
path := config.Namespaces.PathOf(configs.NEWNET)
if path == "" {
// own netns, so hostnet = false
return
}
hostnet, hostnetErr = isHostNetNS(path)
})
if hostnetErr != nil {
return fmt.Errorf("invalid netns path: %w", hostnetErr)
}
if hostnet {
return fmt.Errorf("sysctl %q not allowed in host network namespace", s)
}
continue
}
if config.Namespaces.Contains(configs.NEWUTS) {
switch s {
case "kernel.domainname":
// This is namespaced and there's no explicit OCI field for it.
continue
case "kernel.hostname":
// This is namespaced but there's a conflicting (dedicated) OCI field for it.
return fmt.Errorf("sysctl %q is not allowed as it conflicts with the OCI %q field", s, "hostname")
}
}
return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s)
}
return nil
}
func intelrdtCheck(config *configs.Config) error {
if config.IntelRdt != nil {
if config.IntelRdt.ClosID == "." || config.IntelRdt.ClosID == ".." || strings.Contains(config.IntelRdt.ClosID, "/") {
return fmt.Errorf("invalid intelRdt.ClosID %q", config.IntelRdt.ClosID)
}
if !intelrdt.IsCATEnabled() && config.IntelRdt.L3CacheSchema != "" {
return errors.New("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled")
}
if !intelrdt.IsMBAEnabled() && config.IntelRdt.MemBwSchema != "" {
return errors.New("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled")
}
}
return nil
}
func cgroupsCheck(config *configs.Config) error {
c := config.Cgroups
if c == nil {
return nil
}
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return fmt.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c)
}
r := c.Resources
if r == nil {
return nil
}
if !cgroups.IsCgroup2UnifiedMode() && r.Unified != nil {
return cgroups.ErrV1NoUnified
}
if cgroups.IsCgroup2UnifiedMode() {
_, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
if err != nil {
return err
}
}
return nil
}
func checkBindOptions(m *configs.Mount) error {
if !m.IsBind() {
return nil
}
// We must reject bind-mounts that also have filesystem-specific mount
// options, because the kernel will completely ignore these flags and we
// cannot set them per-mountpoint.
//
// It should be noted that (due to how the kernel caches superblocks), data
// options could also be silently ignored for other filesystems even when
// doing a fresh mount, but there is no real way to avoid this (and it
// matches how everything else works). There have been proposals to make it
// possible for userspace to detect this caching, but this wouldn't help
// runc because the behaviour wouldn't even be desirable for most users.
if m.Data != "" {
return errors.New("bind mounts cannot have any filesystem-specific options applied")
}
return nil
}
func checkIDMapMounts(config *configs.Config, m *configs.Mount) error {
// Make sure MOUNT_ATTR_IDMAP is not set on any of our mounts. This
// attribute is handled differently to all other attributes (through
// m.IDMapping), so make sure we never store it in the actual config. This
// really shouldn't ever happen.
if m.RecAttr != nil && (m.RecAttr.Attr_set|m.RecAttr.Attr_clr)&unix.MOUNT_ATTR_IDMAP != 0 {
return errors.New("mount configuration cannot contain recAttr for MOUNT_ATTR_IDMAP")
}
if !m.IsIDMapped() {
return nil
}
if !m.IsBind() {
return errors.New("id-mapped mounts are only supported for bind-mounts")
}
if config.RootlessEUID {
return errors.New("id-mapped mounts are not supported for rootless containers")
}
if m.IDMapping.UserNSPath == "" {
if len(m.IDMapping.UIDMappings) == 0 || len(m.IDMapping.GIDMappings) == 0 {
return errors.New("id-mapped mounts must have both uid and gid mappings specified")
}
} else {
if m.IDMapping.UIDMappings != nil || m.IDMapping.GIDMappings != nil {
// should never happen
return errors.New("[internal error] id-mapped mounts cannot have both userns_path and uid and gid mappings specified")
}
}
return nil
}
func mountsWarn(config *configs.Config) error {
for _, m := range config.Mounts {
if !filepath.IsAbs(m.Destination) {
return fmt.Errorf("mount %+v: relative destination path is **deprecated**, using it as relative to /", m)
}
}
return nil
}
func mountsStrict(config *configs.Config) error {
for _, m := range config.Mounts {
if err := checkBindOptions(m); err != nil {
return fmt.Errorf("invalid mount %+v: %w", m, err)
}
if err := checkIDMapMounts(config, m); err != nil {
return fmt.Errorf("invalid mount %+v: %w", m, err)
}
}
return nil
}
func isHostNetNS(path string) (bool, error) {
const currentProcessNetns = "/proc/self/ns/net"
var st1, st2 unix.Stat_t
if err := unix.Stat(currentProcessNetns, &st1); err != nil {
return false, &os.PathError{Op: "stat", Path: currentProcessNetns, Err: err}
}
if err := unix.Stat(path, &st2); err != nil {
return false, &os.PathError{Op: "stat", Path: path, Err: err}
}
return (st1.Dev == st2.Dev) && (st1.Ino == st2.Ino), nil
}
// scheduler is to validate scheduler configs according to https://man7.org/linux/man-pages/man2/sched_setattr.2.html
func scheduler(config *configs.Config) error {
s := config.Scheduler
if s == nil {
return nil
}
if s.Policy == "" {
return errors.New("scheduler policy is required")
}
if s.Policy == specs.SchedOther || s.Policy == specs.SchedBatch {
if s.Nice < -20 || s.Nice > 19 {
return fmt.Errorf("invalid scheduler.nice: %d when scheduler.policy is %s", s.Nice, string(s.Policy))
}
}
if s.Priority != 0 && (s.Policy != specs.SchedFIFO && s.Policy != specs.SchedRR) {
return errors.New("scheduler.priority can only be specified for SchedFIFO or SchedRR policy")
}
if s.Policy != specs.SchedDeadline && (s.Runtime != 0 || s.Deadline != 0 || s.Period != 0) {
return errors.New("scheduler runtime/deadline/period can only be specified for SchedDeadline policy")
}
return nil
}
func ioPriority(config *configs.Config) error {
if config.IOPriority == nil {
return nil
}
priority := config.IOPriority.Priority
if priority < 0 || priority > 7 {
return fmt.Errorf("invalid ioPriority.Priority: %d", priority)
}
switch class := config.IOPriority.Class; class {
case specs.IOPRIO_CLASS_RT, specs.IOPRIO_CLASS_BE, specs.IOPRIO_CLASS_IDLE:
// Valid class, do nothing.
default:
return fmt.Errorf("invalid ioPriority.Class: %q", class)
}
return nil
}
package libcontainer
import (
"os"
"golang.org/x/sys/unix"
)
// mountConsole initializes /dev/console inside the rootfs by bind-mounting the
// provided pty slave path over it, ensuring the console node has the correct permission bits.
func mountConsole(slavePath string) error {
f, err := os.Create("/dev/console")
if err != nil && !os.IsExist(err) {
return err
}
if f != nil {
// Ensure permission bits (can be different because of umask).
if err := f.Chmod(0o666); err != nil {
return err
}
f.Close()
}
return mount(slavePath, "/dev/console", "bind", unix.MS_BIND, "")
}
// dupStdio opens the slavePath for the console and dups its fd onto the current
// process's stdio file descriptors (0, 1, and 2).
func dupStdio(slavePath string) error {
fd, err := unix.Open(slavePath, unix.O_RDWR, 0)
if err != nil {
return &os.PathError{
Op: "open",
Path: slavePath,
Err: err,
}
}
for _, i := range []int{0, 1, 2} {
if err := unix.Dup3(fd, i, 0); err != nil {
return err
}
}
return nil
}
// Package libcontainer provides a native Go implementation for creating containers
// with namespaces, cgroups, capabilities, and filesystem access controls.
// It allows you to manage the lifecycle of the container, performing additional operations
// after the container is created.
package libcontainer
import (
"time"
"github.com/opencontainers/runc/libcontainer/configs"
)
// Status is the status of a container.
type Status int
const (
// Created is the status that denotes the container exists but has not been run yet.
Created Status = iota
// Running is the status that denotes the container exists and is running.
Running
// Paused is the status that denotes the container exists, but all its processes are paused.
Paused
// Stopped is the status that denotes the container does not have a created or running process.
Stopped
)
func (s Status) String() string {
switch s {
case Created:
return "created"
case Running:
return "running"
case Paused:
return "paused"
case Stopped:
return "stopped"
default:
return "unknown"
}
}
// BaseState represents the platform agnostic pieces relating to a
// running container's state
type BaseState struct {
// ID is the container ID.
ID string `json:"id"`
// InitProcessPid is the init process id in the parent namespace.
InitProcessPid int `json:"init_process_pid"`
// InitProcessStartTime is the init process start time in clock cycles since boot time.
InitProcessStartTime uint64 `json:"init_process_start"`
// Created is the unix timestamp for the creation time of the container in UTC
Created time.Time `json:"created"`
// Config is the container's configuration.
Config configs.Config `json:"config"`
}
package libcontainer
import (
"bytes"
"errors"
"fmt"
"io"
"os"
"os/exec"
"path"
"path/filepath"
"reflect"
"strconv"
"strings"
"sync"
"time"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink/nl"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/exeseal"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
)
const stdioFdCount = 3
// Container is a libcontainer container object.
type Container struct {
id string
stateDir string
config *configs.Config
cgroupManager cgroups.Manager
intelRdtManager *intelrdt.Manager
initProcess parentProcess
initProcessStartTime uint64
m sync.Mutex
criuVersion int
state containerState
created time.Time
fifo *os.File
}
// State represents a running container's state
type State struct {
BaseState
// Platform specific fields below here
// Rootless specifies whether the container was started in rootless mode.
// It is set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups.
Rootless bool `json:"rootless"`
// Paths to all the container's cgroups, as returned by (*cgroups.Manager).GetPaths
//
// For cgroup v1, a key is cgroup subsystem name, and the value is the path
// to the cgroup for this subsystem.
//
// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
CgroupPaths map[string]string `json:"cgroup_paths"`
// NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
// with the value as the path.
NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`
// Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore
ExternalDescriptors []string `json:"external_descriptors,omitempty"`
// Intel RDT "resource control" filesystem path
IntelRdtPath string `json:"intel_rdt_path"`
}
// ID returns the container's unique ID
func (c *Container) ID() string {
return c.id
}
// Config returns the container's configuration
func (c *Container) Config() configs.Config {
return *c.config
}
// Status returns the current status of the container.
func (c *Container) Status() (Status, error) {
c.m.Lock()
defer c.m.Unlock()
return c.currentStatus()
}
// State returns the current container's state information.
func (c *Container) State() (*State, error) {
c.m.Lock()
defer c.m.Unlock()
return c.currentState(), nil
}
// OCIState returns the current container's state information.
func (c *Container) OCIState() (*specs.State, error) {
c.m.Lock()
defer c.m.Unlock()
return c.currentOCIState()
}
// ignoreCgroupError filters out cgroup-related errors that can be ignored,
// because the container is stopped and its cgroup is gone.
func (c *Container) ignoreCgroupError(err error) error {
if err == nil {
return nil
}
if errors.Is(err, os.ErrNotExist) && !c.hasInit() && !c.cgroupManager.Exists() {
return nil
}
return err
}
// Processes returns the PIDs inside this container. The PIDs are in the
// namespace of the calling process.
//
// Some of the returned PIDs may no longer refer to processes in the container,
// unless the container state is PAUSED in which case every PID in the slice is
// valid.
func (c *Container) Processes() ([]int, error) {
pids, err := c.cgroupManager.GetAllPids()
if err = c.ignoreCgroupError(err); err != nil {
return nil, fmt.Errorf("unable to get all container pids: %w", err)
}
return pids, nil
}
// Stats returns statistics for the container.
func (c *Container) Stats() (*Stats, error) {
var (
err error
stats = &Stats{}
)
if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
return stats, fmt.Errorf("unable to get container cgroup stats: %w", err)
}
if c.intelRdtManager != nil {
if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil {
return stats, fmt.Errorf("unable to get container Intel RDT stats: %w", err)
}
}
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
if err != nil {
return stats, fmt.Errorf("unable to get network stats for interface %q: %w", iface.HostInterfaceName, err)
}
stats.Interfaces = append(stats.Interfaces, istats)
}
}
return stats, nil
}
// Set resources of container as configured. Can be used to change resources
// when the container is running.
func (c *Container) Set(config configs.Config) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
if status == Stopped {
return ErrNotRunning
}
if err := c.cgroupManager.Set(config.Cgroups.Resources); err != nil {
// Set configs back
if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
}
return err
}
if c.intelRdtManager != nil {
if err := c.intelRdtManager.Set(&config); err != nil {
// Set configs back
if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
}
if err2 := c.intelRdtManager.Set(c.config); err2 != nil {
logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
}
return err
}
}
// After the config has been set successfully, update the config and state.
c.config = &config
_, err = c.updateState(nil)
return err
}
// Start starts a process inside the container. Returns error if process fails
// to start. You can track process lifecycle with passed Process structure.
func (c *Container) Start(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
return c.start(process)
}
// Run immediately starts the process inside the container. Returns an error if
// the process fails to start. It does not block waiting for the exec fifo;
// instead, the fifo is opened once start returns.
func (c *Container) Run(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
if err := c.start(process); err != nil {
return err
}
if process.Init {
return c.exec()
}
return nil
}
// Exec signals the container to exec the users process at the end of the init.
func (c *Container) Exec() error {
c.m.Lock()
defer c.m.Unlock()
return c.exec()
}
func (c *Container) exec() error {
path := filepath.Join(c.stateDir, execFifoFilename)
pid := c.initProcess.pid()
blockingFifoOpenCh := awaitFifoOpen(path)
for {
select {
case result := <-blockingFifoOpenCh:
return handleFifoResult(result)
case <-time.After(time.Millisecond * 100):
stat, err := system.Stat(pid)
if err != nil || stat.State == system.Zombie {
// This could be because the process started, ran, and completed between our 100ms timeout and the system.Stat() check.
// See if the fifo exists and has data (using a non-blocking open, which succeeds if the writing process has completed).
if err := handleFifoResult(fifoOpen(path, false)); err != nil {
return errors.New("container process is already dead")
}
return nil
}
}
}
}
func readFromExecFifo(execFifo io.Reader) error {
data, err := io.ReadAll(execFifo)
if err != nil {
return err
}
if len(data) <= 0 {
return errors.New("cannot start an already running container")
}
return nil
}
func awaitFifoOpen(path string) <-chan openResult {
fifoOpened := make(chan openResult)
go func() {
result := fifoOpen(path, true)
fifoOpened <- result
}()
return fifoOpened
}
func fifoOpen(path string, block bool) openResult {
flags := os.O_RDONLY
if !block {
flags |= unix.O_NONBLOCK
}
f, err := os.OpenFile(path, flags, 0)
if err != nil {
return openResult{err: fmt.Errorf("exec fifo: %w", err)}
}
return openResult{file: f}
}
func handleFifoResult(result openResult) error {
if result.err != nil {
return result.err
}
f := result.file
defer f.Close()
if err := readFromExecFifo(f); err != nil {
return err
}
return os.Remove(f.Name())
}
type openResult struct {
file *os.File
err error
}
func (c *Container) start(process *Process) (retErr error) {
if c.config.Cgroups.Resources.SkipDevices {
return errors.New("can't start container with SkipDevices set")
}
if c.config.RootlessEUID && len(process.AdditionalGroups) > 0 {
// We cannot set any additional groups in a rootless container
// and thus we bail if the user asked us to do so.
return errors.New("cannot set any additional groups in a rootless container")
}
if process.Init {
if c.initProcessStartTime != 0 {
return errors.New("container already has init process")
}
if err := c.createExecFifo(); err != nil {
return err
}
defer func() {
if retErr != nil {
c.deleteExecFifo()
}
}()
}
parent, err := c.newParentProcess(process)
if err != nil {
return fmt.Errorf("unable to create new parent process: %w", err)
}
// We do not need the cloned binaries once the process is spawned.
defer process.closeClonedExes()
logsDone := parent.forwardChildLogs()
// Before starting "runc init", mark all non-stdio open files as O_CLOEXEC
// to make sure we don't leak any files into "runc init". Any files to be
// passed to "runc init" through ExtraFiles will get dup2'd by the Go
// runtime and thus their O_CLOEXEC flag will be cleared. This is some
// additional protection against attacks like CVE-2024-21626, by making
// sure we never leak files to "runc init" we didn't intend to.
if err := utils.CloseExecFrom(3); err != nil {
return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err)
}
if err := parent.start(); err != nil {
return fmt.Errorf("unable to start container process: %w", err)
}
if logsDone != nil {
defer func() {
// Wait for log forwarder to finish. This depends on
// runc init closing the _LIBCONTAINER_LOGPIPE log fd.
err := <-logsDone
if err != nil && retErr == nil {
retErr = fmt.Errorf("unable to forward init logs: %w", err)
}
}()
}
if process.Init {
c.fifo.Close()
if c.config.HasHook(configs.Poststart) {
s, err := c.currentOCIState()
if err != nil {
return err
}
if err := c.config.Hooks.Run(configs.Poststart, s); err != nil {
if err := ignoreTerminateErrors(parent.terminate()); err != nil {
logrus.Warn(fmt.Errorf("error running poststart hook: %w", err))
}
return err
}
}
}
return nil
}
// Signal sends a specified signal to container's init.
//
// When s is SIGKILL and the container does not have its own PID namespace, all
// the container's processes are killed. In this scenario, the libcontainer
// user may be required to implement a proper child reaper.
func (c *Container) Signal(s os.Signal) error {
c.m.Lock()
defer c.m.Unlock()
// When a container has its own PID namespace, inside it the init PID
// is 1, and thus it is handled specially by the kernel. In particular,
// killing init with SIGKILL from an ancestor namespace will also kill
// all other processes in that PID namespace (see pid_namespaces(7)).
//
// OTOH, if PID namespace is shared, we should kill all pids to avoid
// leftover processes. Handle this special case here.
if s == unix.SIGKILL && !c.config.Namespaces.IsPrivate(configs.NEWPID) {
if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil {
if c.config.RootlessCgroups { // may not have an access to cgroup
logrus.WithError(err).Warn("failed to kill all processes, possibly due to lack of cgroup (Hint: enable cgroup v2 delegation)")
// Some processes may leak when cgroup is not delegated
// https://github.com/opencontainers/runc/pull/4395#pullrequestreview-2291179652
return c.signal(s)
}
// For a non-rootless container, if there is no init process and no cgroup,
// it means that the container is not running.
if errors.Is(err, ErrCgroupNotExist) && !c.hasInit() {
err = ErrNotRunning
}
return fmt.Errorf("unable to kill all processes: %w", err)
}
return nil
}
return c.signal(s)
}
func (c *Container) signal(s os.Signal) error {
// To avoid a PID reuse attack, don't kill non-running container.
if !c.hasInit() {
return ErrNotRunning
}
if err := c.initProcess.signal(s); err != nil {
return fmt.Errorf("unable to signal init: %w", err)
}
if s == unix.SIGKILL {
// For cgroup v1, killing a process in a frozen cgroup
// does nothing until it's thawed. Only thaw the cgroup
// for SIGKILL.
if paused, _ := c.isPaused(); paused {
_ = c.cgroupManager.Freeze(cgroups.Thawed)
}
}
return nil
}
func (c *Container) createExecFifo() (retErr error) {
rootuid, err := c.config.HostRootUID()
if err != nil {
return err
}
rootgid, err := c.config.HostRootGID()
if err != nil {
return err
}
fifoName := filepath.Join(c.stateDir, execFifoFilename)
if err := unix.Mkfifo(fifoName, 0o622); err != nil {
return &os.PathError{Op: "mkfifo", Path: fifoName, Err: err}
}
defer func() {
if retErr != nil {
os.Remove(fifoName)
}
}()
// Ensure permission bits (can be different because of umask).
if err := os.Chmod(fifoName, 0o622); err != nil {
return err
}
return os.Chown(fifoName, rootuid, rootgid)
}
func (c *Container) deleteExecFifo() {
fifoName := filepath.Join(c.stateDir, execFifoFilename)
os.Remove(fifoName)
}
// includeExecFifo opens the container's execfifo as a pathfd, so that the
// container cannot access the statedir (and the FIFO itself remains
// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited
// fd, with _LIBCONTAINER_FIFOFD set to its fd number.
func (c *Container) includeExecFifo(cmd *exec.Cmd) error {
fifoName := filepath.Join(c.stateDir, execFifoFilename)
fifo, err := os.OpenFile(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
if err != nil {
return err
}
c.fifo = fifo
cmd.ExtraFiles = append(cmd.ExtraFiles, fifo)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
return nil
}
func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
comm, err := newProcessComm()
if err != nil {
return nil, err
}
// Make sure we use a new safe copy of the /proc/self/exe binary each time this
// is called, so that if a container manages to overwrite the file, it cannot
// affect other containers on the system. For runc, this code will only ever be
// called once, but libcontainer users might call it more than once.
p.closeClonedExes()
var (
exePath string
safeExe *os.File
)
if exeseal.IsSelfExeCloned() {
// /proc/self/exe is already a cloned binary -- no need to do anything
logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!")
// We don't need to use /proc/thread-self here because the exe mm of a
// thread-group is guaranteed to be the same for all threads by
// definition. This lets us avoid having to do runtime.LockOSThread.
exePath = "/proc/self/exe"
} else {
var err error
safeExe, err = exeseal.CloneSelfExe(c.stateDir)
if err != nil {
return nil, fmt.Errorf("unable to create safe /proc/self/exe clone for runc init: %w", err)
}
exePath = "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
p.clonedExes = append(p.clonedExes, safeExe)
logrus.Debug("runc exeseal: using /proc/self/exe clone") // used for tests
}
cmd := exec.Command(exePath, "init")
cmd.Args[0] = os.Args[0]
cmd.Stdin = p.Stdin
cmd.Stdout = p.Stdout
cmd.Stderr = p.Stderr
cmd.Dir = c.config.Rootfs
if cmd.SysProcAttr == nil {
cmd.SysProcAttr = &unix.SysProcAttr{}
}
cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS"))
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
if p.ConsoleSocket != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
)
}
cmd.ExtraFiles = append(cmd.ExtraFiles, comm.initSockChild)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
)
cmd.ExtraFiles = append(cmd.ExtraFiles, comm.syncSockChild.File())
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_SYNCPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
)
cmd.ExtraFiles = append(cmd.ExtraFiles, comm.logPipeChild)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
if p.LogLevel != "" {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGLEVEL="+p.LogLevel)
}
if p.PidfdSocket != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, p.PidfdSocket)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_PIDFD_SOCK="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
)
}
// TODO: Once https://go-review.googlesource.com/c/go/+/515799 is included
// in the Go versions we support, we can remove this logic.
if safeExe != nil {
// Due to a Go stdlib bug, we need to add safeExe to the set of
// ExtraFiles otherwise it is possible for the stdlib to clobber the fd
// during forkAndExecInChild1 and replace it with some other file that
// might be malicious. This is less than ideal (because the descriptor
// will be non-O_CLOEXEC) however we have protections in "runc init" to
// stop us from leaking extra file descriptors.
//
// See <https://github.com/golang/go/issues/61751>.
cmd.ExtraFiles = append(cmd.ExtraFiles, safeExe)
// There is a race when opening a file: if a low-numbered fd was closed
// just beforehand, it may be reused for safeExe. Because of the Go stdlib
// fd shuffling bug, if safeExe's fd is too small, the stdlib may dup3 it
// onto another fd (or dup3 another fd onto it), which makes cmd.Path refer
// to a random file and can lead to a "permission denied" error when
// starting the process. Please see #4294.
// So we should not use the original fd of safeExe, but the fd left after
// the Go stdlib's shuffling, since the stdlib guarantees that fd refers to
// the correct file.
cmd.Path = "/proc/self/fd/" + strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)
}
// NOTE: when running a container without a PID namespace and the parent
// process spawning the container is PID 1, the kernel delivers the
// pdeathsig to the container's init process for some reason, even though
// the parent is still running.
if c.config.ParentDeathSignal > 0 {
cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
}
if p.Init {
// We only set up fifoFd if we're not doing a `runc exec`. The historic
// reason for this is that previously we would pass a dirfd that allowed
// for container rootfs escape (and not doing it in `runc exec` avoided
// that problem), but we no longer do that. However, there's no need to do
// this for `runc exec` so we just keep it this way to be safe.
if err := c.includeExecFifo(cmd); err != nil {
return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
}
return c.newInitProcess(p, cmd, comm)
}
return c.newSetnsProcess(p, cmd, comm)
}
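// newInitProcess wraps cmd into an initProcess, preparing the netlink
// bootstrap data (clone flags plus any configured namespace paths) that is
// consumed by the nsenter-based bootstrap in "runc init".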
func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
for _, ns := range c.config.Namespaces {
if ns.Path != "" {
nsMaps[ns.Type] = ns.Path
}
}
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
if err != nil {
return nil, err
}
init := &initProcess{
containerProcess: containerProcess{
cmd: cmd,
comm: comm,
manager: c.cgroupManager,
config: c.newInitConfig(p),
process: p,
bootstrapData: data,
container: c,
},
intelRdtManager: c.intelRdtManager,
}
c.initProcess = init
return init, nil
}
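// newSetnsProcess wraps cmd into a setnsProcess for joining an existing
// container: the bootstrap data carries the container's namespace paths (no
// clone flags), and optional SubCgroupPaths place the new process into a
// sub-cgroup of the container's cgroups.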
func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*setnsProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
state := c.currentState()
// For the setns process, we don't have to set clone flags, as the process
// namespaces will only be joined via the setns syscall.
data, err := c.bootstrapData(0, state.NamespacePaths)
if err != nil {
return nil, err
}
proc := &setnsProcess{
containerProcess: containerProcess{
cmd: cmd,
comm: comm,
manager: c.cgroupManager,
config: c.newInitConfig(p),
process: p,
bootstrapData: data,
container: c,
},
cgroupPaths: state.CgroupPaths,
rootlessCgroups: c.config.RootlessCgroups,
intelRdtPath: state.IntelRdtPath,
initProcessPid: state.InitProcessPid,
}
if len(p.SubCgroupPaths) > 0 {
if add, ok := p.SubCgroupPaths[""]; ok {
// cgroup v1: using the same path for all controllers.
// cgroup v2: the only possible way.
for k := range proc.cgroupPaths {
subPath := path.Join(proc.cgroupPaths[k], add)
if !strings.HasPrefix(subPath, proc.cgroupPaths[k]) {
return nil, fmt.Errorf("%s is not a sub cgroup path", add)
}
proc.cgroupPaths[k] = subPath
}
// cgroup v2: do not try to join init process's cgroup
// as a fallback (see (*setnsProcess).start).
proc.initProcessPid = 0
} else {
// Per-controller paths.
for ctrl, add := range p.SubCgroupPaths {
if val, ok := proc.cgroupPaths[ctrl]; ok {
subPath := path.Join(val, add)
if !strings.HasPrefix(subPath, val) {
return nil, fmt.Errorf("%s is not a sub cgroup path", add)
}
proc.cgroupPaths[ctrl] = subPath
} else {
return nil, fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl)
}
}
}
}
return proc, nil
}
func (c *Container) newInitConfig(process *Process) *initConfig {
// Set initial properties. For those properties that exist
// both in the container config and the process, use the ones
// from the container config first, and override them later.
cfg := &initConfig{
Config: c.config,
Args: process.Args,
Env: process.Env,
UID: process.UID,
GID: process.GID,
AdditionalGroups: process.AdditionalGroups,
Cwd: process.Cwd,
Capabilities: c.config.Capabilities,
PassedFilesCount: len(process.ExtraFiles),
ContainerID: c.ID(),
NoNewPrivileges: c.config.NoNewPrivileges,
AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits,
IOPriority: c.config.IOPriority,
Scheduler: c.config.Scheduler,
CreateConsole: process.ConsoleSocket != nil,
ConsoleWidth: process.ConsoleWidth,
ConsoleHeight: process.ConsoleHeight,
}
// Overwrite config properties with ones from process.
if process.Capabilities != nil {
cfg.Capabilities = process.Capabilities
}
if process.NoNewPrivileges != nil {
cfg.NoNewPrivileges = *process.NoNewPrivileges
}
if process.AppArmorProfile != "" {
cfg.AppArmorProfile = process.AppArmorProfile
}
if process.Label != "" {
cfg.ProcessLabel = process.Label
}
if len(process.Rlimits) > 0 {
cfg.Rlimits = process.Rlimits
}
if process.IOPriority != nil {
cfg.IOPriority = process.IOPriority
}
if process.Scheduler != nil {
cfg.Scheduler = process.Scheduler
}
// Set misc properties.
if cgroups.IsCgroup2UnifiedMode() {
cfg.Cgroup2Path = c.cgroupManager.Path("")
}
return cfg
}
// Destroy destroys the container, if it's in a valid state.
//
// Any event registrations are removed before the container is destroyed.
// No error is returned if the container is already destroyed.
//
// Running containers must first be stopped using Signal.
// Paused containers must first be resumed using Resume.
func (c *Container) Destroy() error {
c.m.Lock()
defer c.m.Unlock()
if err := c.state.destroy(); err != nil {
return fmt.Errorf("unable to destroy container: %w", err)
}
return nil
}
// Pause pauses the container, if its state is RUNNING or CREATED, changing
// its state to PAUSED. If the state is already PAUSED, does nothing.
func (c *Container) Pause() error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
switch status {
case Running, Created:
if err := c.cgroupManager.Freeze(cgroups.Frozen); err != nil {
return err
}
return c.state.transition(&pausedState{
c: c,
})
}
return ErrNotRunning
}
// Resume resumes the execution of any user processes in the
// container before setting the container state to RUNNING.
// This is only performed if the current state is PAUSED.
// If the Container state is RUNNING, does nothing.
func (c *Container) Resume() error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
if status != Paused {
return ErrNotPaused
}
if err := c.cgroupManager.Freeze(cgroups.Thawed); err != nil {
return err
}
return c.state.transition(&runningState{
c: c,
})
}
// NotifyOOM returns a read-only channel signaling when the container receives
// an OOM notification.
func (c *Container) NotifyOOM() (<-chan struct{}, error) {
// XXX(cyphar): This requires cgroups.
if c.config.RootlessCgroups {
logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups")
}
path := c.cgroupManager.Path("memory")
if cgroups.IsCgroup2UnifiedMode() {
return notifyOnOOMV2(path)
}
return notifyOnOOM(path)
}
// NotifyMemoryPressure returns a read-only channel signaling when the
// container reaches a given pressure level.
func (c *Container) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
// XXX(cyphar): This requires cgroups.
if c.config.RootlessCgroups {
logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups")
}
return notifyMemoryPressure(c.cgroupManager.Path("memory"), level)
}
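// updateState records the given process as the container's init process (if
// non-nil), then saves the current state to disk and returns it.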
func (c *Container) updateState(process parentProcess) (*State, error) {
if process != nil {
c.initProcess = process
}
state := c.currentState()
if err := c.saveState(state); err != nil {
return nil, err
}
return state, nil
}
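// saveState atomically writes the container state to the state file by
// writing to a temporary file in the state directory and renaming it over
// the final path.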
func (c *Container) saveState(s *State) (retErr error) {
tmpFile, err := os.CreateTemp(c.stateDir, "state-")
if err != nil {
return err
}
defer func() {
if retErr != nil {
tmpFile.Close()
os.Remove(tmpFile.Name())
}
}()
err = utils.WriteJSON(tmpFile, s)
if err != nil {
return err
}
err = tmpFile.Close()
if err != nil {
return err
}
stateFilePath := filepath.Join(c.stateDir, stateFilename)
return os.Rename(tmpFile.Name(), stateFilePath)
}
func (c *Container) currentStatus() (Status, error) {
if err := c.refreshState(); err != nil {
return -1, err
}
return c.state.status(), nil
}
// refreshState needs to be called to verify that the container's current
// state matches reality. Because consumers of libcontainer can use it out of
// process, we need to verify the container's status based on runtime
// information rather than rely on our in-process info.
func (c *Container) refreshState() error {
paused, err := c.isPaused()
if err != nil {
return err
}
if paused {
return c.state.transition(&pausedState{c: c})
}
if !c.hasInit() {
return c.state.transition(&stoppedState{c: c})
}
// The presence of exec fifo helps to distinguish between
// the created and the running states.
if _, err := os.Stat(filepath.Join(c.stateDir, execFifoFilename)); err == nil {
return c.state.transition(&createdState{c: c})
}
return c.state.transition(&runningState{c: c})
}
// hasInit tells whether the container init process exists.
func (c *Container) hasInit() bool {
if c.initProcess == nil {
return false
}
pid := c.initProcess.pid()
stat, err := system.Stat(pid)
if err != nil {
return false
}
if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead {
return false
}
return true
}
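// isPaused reports whether the container's cgroup freezer state is Frozen.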
func (c *Container) isPaused() (bool, error) {
state, err := c.cgroupManager.GetFreezerState()
if err != nil {
return false, err
}
return state == cgroups.Frozen, nil
}
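// currentState assembles the container's State from the init process (pid,
// start time, external descriptors), the cgroup and intelrdt managers, and
// the per-namespace /proc/<pid>/ns paths.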
func (c *Container) currentState() *State {
var (
startTime uint64
externalDescriptors []string
pid = -1
)
if c.initProcess != nil {
pid = c.initProcess.pid()
startTime, _ = c.initProcess.startTime()
externalDescriptors = c.initProcess.externalDescriptors()
}
intelRdtPath := ""
if c.intelRdtManager != nil {
intelRdtPath = c.intelRdtManager.GetPath()
}
state := &State{
BaseState: BaseState{
ID: c.ID(),
Config: *c.config,
InitProcessPid: pid,
InitProcessStartTime: startTime,
Created: c.created,
},
Rootless: c.config.RootlessEUID && c.config.RootlessCgroups,
CgroupPaths: c.cgroupManager.GetPaths(),
IntelRdtPath: intelRdtPath,
NamespacePaths: make(map[configs.NamespaceType]string),
ExternalDescriptors: externalDescriptors,
}
if pid > 0 {
for _, ns := range c.config.Namespaces {
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
}
for _, nsType := range configs.NamespaceTypes() {
if !configs.IsNamespaceSupported(nsType) {
continue
}
if _, ok := state.NamespacePaths[nsType]; !ok {
ns := configs.Namespace{Type: nsType}
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
}
}
}
return state
}
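// currentOCIState returns the container state in the OCI runtime-spec
// format, including the init process PID for non-stopped containers.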
func (c *Container) currentOCIState() (*specs.State, error) {
bundle, annotations := utils.Annotations(c.config.Labels)
state := &specs.State{
Version: specs.Version,
ID: c.ID(),
Bundle: bundle,
Annotations: annotations,
}
status, err := c.currentStatus()
if err != nil {
return nil, err
}
state.Status = specs.ContainerState(status.String())
if status != Stopped {
if c.initProcess != nil {
state.Pid = c.initProcess.pid()
}
}
return state, nil
}
// orderNamespacePaths sorts namespace paths into a list of paths that we
// can setns in order.
func (c *Container) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
paths := []string{}
for _, ns := range configs.NamespaceTypes() {
// Remove namespaces that we don't need to join.
if !c.config.Namespaces.Contains(ns) {
continue
}
if p, ok := namespaces[ns]; ok && p != "" {
// check if the requested namespace is supported
if !configs.IsNamespaceSupported(ns) {
return nil, fmt.Errorf("namespace %s is not supported", ns)
}
// only set to join this namespace if it exists
if _, err := os.Lstat(p); err != nil {
return nil, fmt.Errorf("namespace path: %w", err)
}
// do not allow a namespace path containing a comma, as commas are
// used to separate the namespace paths
if strings.ContainsRune(p, ',') {
return nil, fmt.Errorf("invalid namespace path %s", p)
}
paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
}
}
return paths, nil
}
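// encodeIDMapping serializes ID mappings into the "containerID hostID size"
// text format used by /proc/<pid>/uid_map and gid_map, one mapping per line.
// For example, {ContainerID: 0, HostID: 100000, Size: 65536} becomes the
// line "0 100000 65536".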
func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
data := bytes.NewBuffer(nil)
for _, im := range idMap {
line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
if _, err := data.WriteString(line); err != nil {
return nil, err
}
}
return data.Bytes(), nil
}
// netlinkError is an error wrapper type for use by custom netlink message
// types. Panics with errors are wrapped in netlinkError so that the recover
// in bootstrapData can distinguish intentional panics.
type netlinkError struct{ error }
// bootstrapData encodes the necessary data in netlink binary format
// as an io.Reader.
// The consumer can write the data to a bootstrap program
// such as one that uses the nsenter package to bootstrap the container's
// init process correctly, i.e. with the correct namespaces, uid/gid
// mappings, etc.
func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (_ io.Reader, Err error) {
// create the netlink message
r := nl.NewNetlinkRequest(int(InitMsg), 0)
// Our custom messages cannot bubble up an error using returns, instead
// they will panic with the specific error type, netlinkError. In that
// case, recover from the panic and return that as an error.
defer func() {
if r := recover(); r != nil {
if e, ok := r.(netlinkError); ok {
Err = e.error
} else {
panic(r)
}
}
}()
// write cloneFlags
r.AddData(&Int32msg{
Type: CloneFlagsAttr,
Value: uint32(cloneFlags),
})
// write custom namespace paths
if len(nsMaps) > 0 {
nsPaths, err := c.orderNamespacePaths(nsMaps)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: NsPathsAttr,
Value: []byte(strings.Join(nsPaths, ",")),
})
}
// write uid/gid mappings only when we are not joining an existing user ns
_, joinExistingUser := nsMaps[configs.NEWUSER]
if !joinExistingUser {
// write uid mappings
if len(c.config.UIDMappings) > 0 {
if c.config.RootlessEUID {
// We resolve the paths for new{u,g}idmap from
// the context of runc to avoid doing a path
// lookup in the nsexec context.
if path, err := exec.LookPath("newuidmap"); err == nil {
r.AddData(&Bytemsg{
Type: UidmapPathAttr,
Value: []byte(path),
})
}
}
b, err := encodeIDMapping(c.config.UIDMappings)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: UidmapAttr,
Value: b,
})
}
// write gid mappings
if len(c.config.GIDMappings) > 0 {
b, err := encodeIDMapping(c.config.GIDMappings)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: GidmapAttr,
Value: b,
})
if c.config.RootlessEUID {
if path, err := exec.LookPath("newgidmap"); err == nil {
r.AddData(&Bytemsg{
Type: GidmapPathAttr,
Value: []byte(path),
})
}
}
if requiresRootOrMappingTool(c.config) {
r.AddData(&Boolmsg{
Type: SetgroupAttr,
Value: true,
})
}
}
}
if c.config.OomScoreAdj != nil {
// write oom_score_adj
r.AddData(&Bytemsg{
Type: OomScoreAdjAttr,
Value: []byte(strconv.Itoa(*c.config.OomScoreAdj)),
})
}
// write rootless
r.AddData(&Boolmsg{
Type: RootlessEUIDAttr,
Value: c.config.RootlessEUID,
})
// write boottime and monotonic time ns offsets only when we are not joining an existing time ns
_, joinExistingTime := nsMaps[configs.NEWTIME]
if !joinExistingTime && c.config.TimeOffsets != nil {
var offsetSpec bytes.Buffer
for clock, offset := range c.config.TimeOffsets {
fmt.Fprintf(&offsetSpec, "%s %d %d\n", clock, offset.Secs, offset.Nanosecs)
}
r.AddData(&Bytemsg{
Type: TimeOffsetsAttr,
Value: offsetSpec.Bytes(),
})
}
return bytes.NewReader(r.Serialize()), nil
}
// ignoreTerminateErrors returns nil if err is nil or matches an error known
// to indicate that the terminate occurred successfully; otherwise, err is
// returned unaltered.
func ignoreTerminateErrors(err error) error {
if err == nil {
return nil
}
// terminate() might return an error from either Kill or Wait.
// The (*Cmd).Wait documentation says: "If the command fails to run
// or doesn't complete successfully, the error is of type *ExitError".
// Filter out such errors (like "exit status 1" or "signal: killed").
var exitErr *exec.ExitError
if errors.As(err, &exitErr) {
return nil
}
if errors.Is(err, os.ErrProcessDone) {
return nil
}
s := err.Error()
if strings.Contains(s, "Wait was already called") {
return nil
}
return err
}
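// requiresRootOrMappingTool reports whether the configured GID mappings are
// anything other than a single mapping of container root to the current
// process's effective GID. bootstrapData uses the result to decide whether
// setgroups should be enabled (SetgroupAttr) in the container's user
// namespace.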
func requiresRootOrMappingTool(c *configs.Config) bool {
gidMap := []configs.IDMap{
{ContainerID: 0, HostID: int64(os.Getegid()), Size: 1},
}
return !reflect.DeepEqual(c.GIDMappings, gidMap)
}
package libcontainer
import (
"fmt"
"os"
"testing"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
)
type mockCgroupManager struct {
pids []int
allPids []int
paths map[string]string
}
func (m *mockCgroupManager) GetPids() ([]int, error) {
return m.pids, nil
}
func (m *mockCgroupManager) GetAllPids() ([]int, error) {
return m.allPids, nil
}
func (m *mockCgroupManager) GetStats() (*cgroups.Stats, error) {
return nil, nil
}
func (m *mockCgroupManager) Apply(pid int) error {
return nil
}
func (m *mockCgroupManager) Set(_ *cgroups.Resources) error {
return nil
}
func (m *mockCgroupManager) Destroy() error {
return nil
}
func (m *mockCgroupManager) Exists() bool {
_, err := os.Lstat(m.Path("devices"))
return err == nil
}
func (m *mockCgroupManager) OOMKillCount() (uint64, error) {
return 0, nil
}
func (m *mockCgroupManager) GetPaths() map[string]string {
return m.paths
}
func (m *mockCgroupManager) Path(subsys string) string {
return m.paths[subsys]
}
func (m *mockCgroupManager) Freeze(_ cgroups.FreezerState) error {
return nil
}
func (m *mockCgroupManager) GetCgroups() (*cgroups.Cgroup, error) {
return nil, nil
}
func (m *mockCgroupManager) GetFreezerState() (cgroups.FreezerState, error) {
return cgroups.Thawed, nil
}
type mockProcess struct {
_pid int
started uint64
}
func (m *mockProcess) terminate() error {
return nil
}
func (m *mockProcess) pid() int {
return m._pid
}
func (m *mockProcess) startTime() (uint64, error) {
return m.started, nil
}
func (m *mockProcess) start() error {
return nil
}
func (m *mockProcess) wait() (*os.ProcessState, error) {
return nil, nil
}
func (m *mockProcess) signal(_ os.Signal) error {
return nil
}
func (m *mockProcess) externalDescriptors() []string {
return []string{}
}
func (m *mockProcess) setExternalDescriptors(newFds []string) {
}
func (m *mockProcess) forwardChildLogs() chan error {
return nil
}
func TestGetContainerPids(t *testing.T) {
pid := 1
stat, err := system.Stat(pid)
if err != nil {
t.Fatalf("can't stat pid %d, got %v", pid, err)
}
container := &Container{
id: "myid",
config: &configs.Config{},
cgroupManager: &mockCgroupManager{
allPids: []int{1, 2, 3},
paths: map[string]string{
"device": "/proc/self/cgroups",
},
},
initProcess: &mockProcess{
_pid: 1,
started: 10,
},
initProcessStartTime: stat.StartTime,
}
container.state = &runningState{c: container}
pids, err := container.Processes()
if err != nil {
t.Fatal(err)
}
for i, expected := range []int{1, 2, 3} {
if pids[i] != expected {
t.Fatalf("expected pid %d but received %d", expected, pids[i])
}
}
}
func TestGetContainerState(t *testing.T) {
var (
pid = os.Getpid()
expectedMemoryPath = "/sys/fs/cgroup/memory/myid"
expectedNetworkPath = fmt.Sprintf("/proc/%d/ns/net", pid)
)
container := &Container{
id: "myid",
config: &configs.Config{
Namespaces: []configs.Namespace{
{Type: configs.NEWPID},
{Type: configs.NEWNS},
{Type: configs.NEWNET, Path: expectedNetworkPath},
{Type: configs.NEWUTS},
// emulate host for IPC
//{Type: configs.NEWIPC},
{Type: configs.NEWCGROUP},
},
},
initProcess: &mockProcess{
_pid: pid,
started: 10,
},
cgroupManager: &mockCgroupManager{
pids: []int{1, 2, 3},
paths: map[string]string{
"memory": expectedMemoryPath,
},
},
}
container.state = &createdState{c: container}
state, err := container.State()
if err != nil {
t.Fatal(err)
}
if state.InitProcessPid != pid {
t.Fatalf("expected pid %d but received %d", pid, state.InitProcessPid)
}
if state.InitProcessStartTime != 10 {
t.Fatalf("expected process start time 10 but received %d", state.InitProcessStartTime)
}
paths := state.CgroupPaths
if paths == nil {
t.Fatal("cgroup paths should not be nil")
}
if memPath := paths["memory"]; memPath != expectedMemoryPath {
t.Fatalf("expected memory path %q but received %q", expectedMemoryPath, memPath)
}
for _, ns := range container.config.Namespaces {
path := state.NamespacePaths[ns.Type]
if path == "" {
t.Fatalf("expected non nil namespace path for %s", ns.Type)
}
if ns.Type == configs.NEWNET {
if path != expectedNetworkPath {
t.Fatalf("expected path %q but received %q", expectedNetworkPath, path)
}
} else {
file := ""
switch ns.Type {
case configs.NEWNET:
file = "net"
case configs.NEWNS:
file = "mnt"
case configs.NEWPID:
file = "pid"
case configs.NEWIPC:
file = "ipc"
case configs.NEWUSER:
file = "user"
case configs.NEWUTS:
file = "uts"
case configs.NEWCGROUP:
file = "cgroup"
}
expected := fmt.Sprintf("/proc/%d/ns/%s", pid, file)
if expected != path {
t.Fatalf("expected path %q but received %q", expected, path)
}
}
}
}
func TestGetContainerStateAfterUpdate(t *testing.T) {
pid := os.Getpid()
stat, err := system.Stat(pid)
if err != nil {
t.Fatal(err)
}
container := &Container{
stateDir: t.TempDir(),
id: "myid",
config: &configs.Config{
Namespaces: []configs.Namespace{
{Type: configs.NEWPID},
{Type: configs.NEWNS},
{Type: configs.NEWNET},
{Type: configs.NEWUTS},
{Type: configs.NEWIPC},
},
Cgroups: &cgroups.Cgroup{
Resources: &cgroups.Resources{
Memory: 1024,
},
},
},
initProcess: &mockProcess{
_pid: pid,
started: stat.StartTime,
},
cgroupManager: &mockCgroupManager{},
}
container.state = &createdState{c: container}
state, err := container.State()
if err != nil {
t.Fatal(err)
}
if state.InitProcessPid != pid {
t.Fatalf("expected pid %d but received %d", pid, state.InitProcessPid)
}
if state.InitProcessStartTime != stat.StartTime {
t.Fatalf("expected process start time %d but received %d", stat.StartTime, state.InitProcessStartTime)
}
if state.Config.Cgroups.Resources.Memory != 1024 {
t.Fatalf("expected Memory to be 1024 but received %d", state.Config.Cgroups.Resources.Memory)
}
// Set initProcessStartTime so we fake to be running
container.initProcessStartTime = state.InitProcessStartTime
container.state = &runningState{c: container}
newConfig := container.Config()
newConfig.Cgroups.Resources.Memory = 2048
if err := container.Set(newConfig); err != nil {
t.Fatal(err)
}
state, err = container.State()
if err != nil {
t.Fatal(err)
}
if state.Config.Cgroups.Resources.Memory != 2048 {
t.Fatalf("expected Memory to be 2048 but received %d", state.Config.Cgroups.Resources.Memory)
}
}
//go:build !runc_nocriu
package libcontainer
import (
"bufio"
"bytes"
"encoding/json"
"errors"
"fmt"
"net"
"os"
"os/exec"
"path/filepath"
"reflect"
"strings"
"time"
"github.com/checkpoint-restore/go-criu/v6"
criurpc "github.com/checkpoint-restore/go-criu/v6/rpc"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"google.golang.org/protobuf/proto"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
)
var criuFeatures *criurpc.CriuFeatures
var ErrCriuMissingFeatures = errors.New("criu is missing features")
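// checkCriuFeatures sends a FEATURE_CHECK request to CRIU asking for the
// features listed in criuFeat, and returns ErrCriuMissingFeatures (wrapping
// the list of missing features) if any of them are not supported.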
func (c *Container) checkCriuFeatures(criuOpts *CriuOpts, criuFeat *criurpc.CriuFeatures) error {
t := criurpc.CriuReqType_FEATURE_CHECK
// make sure the features we are looking for are really not from
// some previous check
criuFeatures = nil
req := &criurpc.CriuReq{
Type: &t,
Features: criuFeat,
}
err := c.criuSwrk(nil, req, criuOpts, nil)
if err != nil {
return fmt.Errorf("CRIU feature check failed: %w", err)
}
var missingFeatures []string
// The outer if checks if the fields actually exist
if (criuFeat.MemTrack != nil) &&
(criuFeatures.MemTrack != nil) {
// The inner if checks if they are set to true
if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
missingFeatures = append(missingFeatures, "MemTrack")
logrus.Debugf("CRIU does not support MemTrack")
}
}
// This needs to be repeated for every new feature check.
// Is there a way to put this in a function? Reflection?
if (criuFeat.LazyPages != nil) &&
(criuFeatures.LazyPages != nil) {
if *criuFeat.LazyPages && !*criuFeatures.LazyPages {
missingFeatures = append(missingFeatures, "LazyPages")
logrus.Debugf("CRIU does not support LazyPages")
}
}
if len(missingFeatures) != 0 {
return fmt.Errorf("%w: %v", ErrCriuMissingFeatures, missingFeatures)
}
return nil
}
func compareCriuVersion(criuVersion int, minVersion int) error {
// simple function to perform the actual version compare
if criuVersion < minVersion {
return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion)
}
return nil
}
// checkCriuVersion checks that the CRIU version is greater than or equal to minVersion.
func (c *Container) checkCriuVersion(minVersion int) error {
// If the version of criu has already been determined there is no need
// to ask criu for the version again. Use the value from c.criuVersion.
if c.criuVersion != 0 {
return compareCriuVersion(c.criuVersion, minVersion)
}
criu := criu.MakeCriu()
var err error
c.criuVersion, err = criu.GetCriuVersion()
if err != nil {
return fmt.Errorf("CRIU version check failed: %w", err)
}
return compareCriuVersion(c.criuVersion, minVersion)
}
const descriptorsFilename = "descriptors.json"
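// addCriuDumpMount registers the mount destination (made relative to the
// container rootfs) as an external mount mapping so that CRIU treats it as
// an external bind mount during dump.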
func (c *Container) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil {
mountDest = dest[len(c.config.Rootfs):]
}
extMnt := &criurpc.ExtMountMap{
Key: proto.String(mountDest),
Val: proto.String(mountDest),
}
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
func (c *Container) addMaskPaths(req *criurpc.CriuReq) error {
for _, path := range c.config.MaskPaths {
fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
if err != nil {
if os.IsNotExist(err) {
continue
}
return err
}
if fi.IsDir() {
continue
}
extMnt := &criurpc.ExtMountMap{
Key: proto.String(path),
Val: proto.String("/dev/null"),
}
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
return nil
}
func (c *Container) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) {
// CRIU will evaluate a configuration starting with release 3.11.
// Settings in the configuration file will overwrite RPC settings.
// Look for annotations. The annotation 'org.criu.config'
// specifies whether CRIU should use a different, container-specific
// configuration file.
configFile, exists := utils.SearchLabels(c.config.Labels, "org.criu.config")
if exists {
// If the annotation 'org.criu.config' exists and is set
// to a non-empty string, tell CRIU to use that as a
// configuration file. If the file does not exist, CRIU
// will just ignore it.
if configFile != "" {
rpcOpts.ConfigFile = proto.String(configFile)
}
// If 'org.criu.config' exists and is set to an empty
// string, a runc-specific CRIU configuration file will
// not be set at all.
} else {
// If the mentioned annotation has not been found, specify
// a default CRIU configuration file.
rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf")
}
}
func (c *Container) criuSupportsExtNS(t configs.NamespaceType) bool {
var minVersion int
switch t {
case configs.NEWNET:
// CRIU supports different external namespaces with different CRIU
// releases. For network namespaces to work we need at least criu
// 3.11.0 (31100).
minVersion = 31100
case configs.NEWPID:
// For PID namespaces, at least criu 3.15 (31500) is needed.
minVersion = 31500
default:
return false
}
return c.checkCriuVersion(minVersion) == nil
}
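// criuNsToKey builds the key used to identify an external namespace in CRIU
// requests, e.g. "extRootNetNS" for the network namespace. The same key must
// be used during both checkpoint and restore.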
func criuNsToKey(t configs.NamespaceType) string {
return "extRoot" + strings.Title(configs.NsName(t)) + "NS" //nolint:staticcheck // SA1019: strings.Title is deprecated
}
func (c *Container) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error {
if !c.criuSupportsExtNS(t) {
return fmt.Errorf("criu lacks support for external %s namespace during checkpointing process (old criu version?)", configs.NsName(t))
}
nsPath := c.config.Namespaces.PathOf(t)
if nsPath == "" {
return nil
}
// CRIU expects the information about an external namespace
// like this: --external <TYPE>[<inode>]:<key>
// This <key> is always 'extRoot<TYPE>NS'.
var ns unix.Stat_t
if err := unix.Stat(nsPath, &ns); err != nil {
return err
}
criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, criuNsToKey(t))
rpcOpts.External = append(rpcOpts.External, criuExternal)
return nil
}
func (c *Container) handleRestoringNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File) error {
for _, ns := range c.config.Namespaces {
switch ns.Type {
case configs.NEWNET, configs.NEWPID:
// If the container is running in a network or PID namespace and has
// a path to the network or PID namespace configured, we will dump
// that network or PID namespace as an external namespace and we
// will expect that the namespace exists during restore.
// This basically means that CRIU will ignore the namespace
// and expect it to be set up correctly.
if err := c.handleRestoringExternalNamespaces(rpcOpts, extraFiles, ns.Type); err != nil {
return err
}
default:
// For all other namespaces except NET and PID CRIU has
// a simpler way of joining the existing namespace if set
nsPath := c.config.Namespaces.PathOf(ns.Type)
if nsPath == "" {
continue
}
if ns.Type == configs.NEWCGROUP {
// CRIU has no code to handle NEWCGROUP
return fmt.Errorf("Do not know how to handle namespace %v", ns.Type)
}
// CRIU has code to handle NEWTIME, but it does not seem to be defined in runc
// CRIU will issue a warning for NEWUSER:
// criu/namespaces.c: 'join-ns with user-namespace is not fully tested and dangerous'
rpcOpts.JoinNs = append(rpcOpts.JoinNs, &criurpc.JoinNamespace{
Ns: proto.String(configs.NsName(ns.Type)),
NsFile: proto.String(nsPath),
})
}
}
return nil
}
func (c *Container) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File, t configs.NamespaceType) error {
if !c.criuSupportsExtNS(t) {
return fmt.Errorf("criu lacks support for external %s namespace during the restoration process (old criu version?)", configs.NsName(t))
}
nsPath := c.config.Namespaces.PathOf(t)
if nsPath == "" {
return nil
}
// CRIU wants the information about an existing namespace
// like this: --inherit-fd fd[<fd>]:<key>
// The <key> needs to be the same as during checkpointing.
// We are always using 'extRoot<TYPE>NS' as the key here.
nsFd, err := os.Open(nsPath)
if err != nil {
logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
}
inheritFd := &criurpc.InheritFd{
Key: proto.String(criuNsToKey(t)),
// The offset of four is necessary because 0, 1, 2 and 3 are
// already used by stdin, stdout, stderr, 'criu swrk' socket.
Fd: proto.Int32(int32(4 + len(*extraFiles))),
}
rpcOpts.InheritFd = append(rpcOpts.InheritFd, inheritFd)
// All open FDs need to be transferred to CRIU via extraFiles
*extraFiles = append(*extraFiles, nsFd)
return nil
}
func (c *Container) Checkpoint(criuOpts *CriuOpts) error {
const logFile = "dump.log"
c.m.Lock()
defer c.m.Unlock()
// Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
// (CLI prints a warning)
// TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
// support for doing unprivileged dumps, but the setup of
// rootless containers might make this complicated.
// We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0
if err := c.checkCriuVersion(30000); err != nil {
return err
}
if criuOpts.ImagesDirectory == "" {
return errors.New("invalid directory to save checkpoint")
}
cgMode, err := criuCgMode(criuOpts.ManageCgroupsMode)
if err != nil {
return err
}
// Since a container can be C/R'ed multiple times,
// the checkpoint directory may already exist.
if err := os.Mkdir(criuOpts.ImagesDirectory, 0o700); err != nil && !os.IsExist(err) {
return err
}
logDir := criuOpts.ImagesDirectory
imageDir, err := os.Open(criuOpts.ImagesDirectory)
if err != nil {
return err
}
defer imageDir.Close()
rpcOpts := criurpc.CriuOpts{
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
LogLevel: proto.Int32(4),
LogFile: proto.String(logFile),
Root: proto.String(c.config.Rootfs),
ManageCgroups: proto.Bool(true), // Obsoleted by ManageCgroupsMode.
ManageCgroupsMode: &cgMode,
NotifyScripts: proto.Bool(true),
Pid: proto.Int32(int32(c.initProcess.pid())),
ShellJob: proto.Bool(criuOpts.ShellJob),
LeaveRunning: proto.Bool(criuOpts.LeaveRunning),
TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
TcpSkipInFlight: proto.Bool(criuOpts.TcpSkipInFlight),
LinkRemap: proto.Bool(criuOpts.LinkRemap),
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
FileLocks: proto.Bool(criuOpts.FileLocks),
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
OrphanPtsMaster: proto.Bool(true),
AutoDedup: proto.Bool(criuOpts.AutoDedup),
LazyPages: proto.Bool(criuOpts.LazyPages),
}
// if criuOpts.WorkDirectory is not set, criu default is used.
if criuOpts.WorkDirectory != "" {
if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
return err
}
workDir, err := os.Open(criuOpts.WorkDirectory)
if err != nil {
return err
}
defer workDir.Close()
rpcOpts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
logDir = criuOpts.WorkDirectory
}
c.handleCriuConfigurationFile(&rpcOpts)
// If the container is running in a network namespace and has
// a path to the network namespace configured, we will dump
// that network namespace as an external namespace and we
// will expect that the namespace exists during restore.
// This basically means that CRIU will ignore the namespace
// and expect it to be set up correctly.
if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWNET); err != nil {
return err
}
// Same for possible external PID namespaces
if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWPID); err != nil {
return err
}
// CRIU can use cgroup freezer; when rpcOpts.FreezeCgroup
// is not set, CRIU uses ptrace() to pause the processes.
// Note cgroup v2 freezer is only supported since CRIU release 3.14.
if !cgroups.IsCgroup2UnifiedMode() || c.checkCriuVersion(31400) == nil {
if fcg := c.cgroupManager.Path("freezer"); fcg != "" {
rpcOpts.FreezeCgroup = proto.String(fcg)
}
}
// append optional criu opts, e.g., page-server and port
if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
rpcOpts.Ps = &criurpc.CriuPageServerInfo{
Address: proto.String(criuOpts.PageServer.Address),
Port: proto.Int32(criuOpts.PageServer.Port),
}
}
// pre-dump may need parentImage param to complete iterative migration
if criuOpts.ParentImage != "" {
rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
rpcOpts.TrackMem = proto.Bool(true)
}
var t criurpc.CriuReqType
if criuOpts.PreDump {
feat := criurpc.CriuFeatures{
MemTrack: proto.Bool(true),
}
if err := c.checkCriuFeatures(criuOpts, &feat); err != nil {
return err
}
t = criurpc.CriuReqType_PRE_DUMP
} else {
t = criurpc.CriuReqType_DUMP
}
if criuOpts.LazyPages {
// lazy migration requested; check if criu supports it
feat := criurpc.CriuFeatures{
LazyPages: proto.Bool(true),
}
if err := c.checkCriuFeatures(criuOpts, &feat); err != nil {
return err
}
if fd := criuOpts.StatusFd; fd != -1 {
// check that the FD is valid
flags, err := unix.FcntlInt(uintptr(fd), unix.F_GETFL, 0)
if err != nil {
return fmt.Errorf("invalid --status-fd argument %d: %w", fd, err)
}
// and writable
if flags&unix.O_WRONLY == 0 {
return fmt.Errorf("invalid --status-fd argument %d: not writable", fd)
}
if c.checkCriuVersion(31500) != nil {
// For criu 3.15+, use notifications (see case "status-ready"
// in criuNotifications). Otherwise, rely on criu status fd.
rpcOpts.StatusFd = proto.Int32(int32(fd))
}
}
}
req := &criurpc.CriuReq{
Type: &t,
Opts: &rpcOpts,
}
// no need to dump all this in pre-dump
if !criuOpts.PreDump {
hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
for _, m := range c.config.Mounts {
switch m.Device {
case "bind":
c.addCriuDumpMount(req, m)
case "cgroup":
if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
// real mount(s)
continue
}
// a set of "external" bind mounts
binds, err := getCgroupMounts(m)
if err != nil {
return err
}
for _, b := range binds {
c.addCriuDumpMount(req, b)
}
}
}
if err := c.addMaskPaths(req); err != nil {
return err
}
for _, node := range c.config.Devices {
m := &configs.Mount{Destination: node.Path, Source: node.Path}
c.addCriuDumpMount(req, m)
}
// Write the FD info to a file in the image directory
fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
if err != nil {
return err
}
err = os.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0o600)
if err != nil {
return err
}
}
err = c.criuSwrk(nil, req, criuOpts, nil)
if err != nil {
logCriuErrors(logDir, logFile)
return err
}
return nil
}
func (c *Container) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil {
mountDest = dest[len(c.config.Rootfs):]
}
extMnt := &criurpc.ExtMountMap{
Key: proto.String(mountDest),
Val: proto.String(m.Source),
}
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
func (c *Container) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(iface.HostInterfaceName)
veth.IfIn = proto.String(iface.Name)
req.Opts.Veths = append(req.Opts.Veths, veth)
case "loopback":
// Do nothing
}
}
for _, i := range criuOpts.VethPairs {
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(i.HostInterfaceName)
veth.IfIn = proto.String(i.ContainerInterfaceName)
req.Opts.Veths = append(req.Opts.Veths, veth)
}
}
// makeCriuRestoreMountpoints makes the actual mountpoints for the
// restore using CRIU. This function is inspired from the code in
// rootfs_linux.go.
func (c *Container) makeCriuRestoreMountpoints(m *configs.Mount) error {
if m.Device == "cgroup" {
// No mount point(s) need to be created:
//
// * for v1, mount points are saved by CRIU because
// /sys/fs/cgroup is a tmpfs mount
//
// * for v2, /sys/fs/cgroup is a real mount, but
// the mountpoint appears as soon as /sys is mounted
return nil
}
// TODO: pass srcFD? Not sure if criu is impacted by issue #2484.
me := mountEntry{Mount: m}
// For all other filesystems, just make the target.
if _, err := createMountpoint(c.config.Rootfs, me); err != nil {
return fmt.Errorf("create criu restore mountpoint for %s mount: %w", me.Destination, err)
}
return nil
}
// isPathInPrefixList is a small function for CRIU restore to make sure
// mountpoints, which are on a tmpfs, are not created in the rootfs.
func isPathInPrefixList(path string, prefix []string) bool {
for _, p := range prefix {
if strings.HasPrefix(path, p+"/") {
return true
}
}
return false
}
// prepareCriuRestoreMounts tries to set up the rootfs of the
// container to be restored in the same way runc does it for
// initial container creation. Even for a read-only rootfs container
// runc modifies the rootfs to add mountpoints which do not exist.
// This function also creates missing mountpoints as long as they
// are not on top of a tmpfs, as CRIU will restore tmpfs content anyway.
func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
// First get a list of a all tmpfs mounts
tmpfs := []string{}
for _, m := range mounts {
switch m.Device {
case "tmpfs":
tmpfs = append(tmpfs, m.Destination)
}
}
// Now go through all mounts and create the mountpoints
// if the mountpoints are not on a tmpfs, as CRIU will
// restore the complete tmpfs content from its checkpoint.
umounts := []string{}
defer func() {
for _, u := range umounts {
_ = utils.WithProcfd(c.config.Rootfs, u, func(procfd string) error {
if e := unix.Unmount(procfd, unix.MNT_DETACH); e != nil {
if e != unix.EINVAL {
// Ignore EINVAL as it means 'target is not a mount point.'
// It probably has already been unmounted.
logrus.Warnf("Error during cleanup unmounting of %s (%s): %v", procfd, u, e)
}
}
return nil
})
}
}()
for _, m := range mounts {
if !isPathInPrefixList(m.Destination, tmpfs) {
if err := c.makeCriuRestoreMountpoints(m); err != nil {
return err
}
// If the mount point is a bind mount, we need to mount
// it now so that runc can create the necessary mount
// points for mounts in bind mounts.
// This also happens during initial container creation.
// Without this CRIU restore will fail
// See: https://github.com/opencontainers/runc/issues/2748
// It is also not necessary to order the mount points
// because during initial container creation mounts are
// set up in the order they are configured.
if m.Device == "bind" {
if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFd string) error {
return mountViaFds(m.Source, nil, m.Destination, dstFd, "", unix.MS_BIND|unix.MS_REC, "")
}); err != nil {
return err
}
umounts = append(umounts, m.Destination)
}
}
}
return nil
}
// Restore restores the checkpointed container to a running state using the
// criu(8) utility.
func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error {
const logFile = "restore.log"
c.m.Lock()
defer c.m.Unlock()
var extraFiles []*os.File
// Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
// (CLI prints a warning)
// TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
// support for unprivileged restore at the moment.
// We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0
if err := c.checkCriuVersion(30000); err != nil {
return err
}
if criuOpts.ImagesDirectory == "" {
return errors.New("invalid directory to restore checkpoint")
}
cgMode, err := criuCgMode(criuOpts.ManageCgroupsMode)
if err != nil {
return err
}
logDir := criuOpts.ImagesDirectory
imageDir, err := os.Open(criuOpts.ImagesDirectory)
if err != nil {
return err
}
defer imageDir.Close()
// CRIU has a few requirements for a root directory:
// * it must be a mount point
// * its parent must not be overmounted
// c.config.Rootfs is bind-mounted to a temporary directory
// to satisfy these requirements.
root := filepath.Join(c.stateDir, "criu-root")
if err := os.Mkdir(root, 0o755); err != nil {
return err
}
defer os.Remove(root)
root, err = filepath.EvalSymlinks(root)
if err != nil {
return err
}
err = mount(c.config.Rootfs, root, "", unix.MS_BIND|unix.MS_REC, "")
if err != nil {
return err
}
defer unix.Unmount(root, unix.MNT_DETACH) //nolint: errcheck
t := criurpc.CriuReqType_RESTORE
req := &criurpc.CriuReq{
Type: &t,
Opts: &criurpc.CriuOpts{
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
EvasiveDevices: proto.Bool(true),
LogLevel: proto.Int32(4),
LogFile: proto.String(logFile),
RstSibling: proto.Bool(true),
Root: proto.String(root),
ManageCgroups: proto.Bool(true), // Obsoleted by ManageCgroupsMode.
ManageCgroupsMode: &cgMode,
NotifyScripts: proto.Bool(true),
ShellJob: proto.Bool(criuOpts.ShellJob),
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
FileLocks: proto.Bool(criuOpts.FileLocks),
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
OrphanPtsMaster: proto.Bool(true),
AutoDedup: proto.Bool(criuOpts.AutoDedup),
LazyPages: proto.Bool(criuOpts.LazyPages),
},
}
if criuOpts.LsmProfile != "" {
// CRIU older than 3.16 has a bug that breaks the ability
// to set a different LSM profile.
if err := c.checkCriuVersion(31600); err != nil {
return errors.New("--lsm-profile requires at least CRIU 3.16")
}
req.Opts.LsmProfile = proto.String(criuOpts.LsmProfile)
}
if criuOpts.LsmMountContext != "" {
if err := c.checkCriuVersion(31600); err != nil {
return errors.New("--lsm-mount-context requires at least CRIU 3.16")
}
req.Opts.LsmMountContext = proto.String(criuOpts.LsmMountContext)
}
if criuOpts.WorkDirectory != "" {
// Since a container can be C/R'ed multiple times,
// the work directory may already exist.
if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
return err
}
workDir, err := os.Open(criuOpts.WorkDirectory)
if err != nil {
return err
}
defer workDir.Close()
req.Opts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
logDir = criuOpts.WorkDirectory
}
c.handleCriuConfigurationFile(req.Opts)
if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil {
return err
}
// This will modify the rootfs of the container in the same way runc
// modifies the container during initial creation.
if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil {
return err
}
hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
for _, m := range c.config.Mounts {
switch m.Device {
case "bind":
c.addCriuRestoreMount(req, m)
case "cgroup":
if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
continue
}
// cgroup v1 is a set of bind mounts, unless cgroupns is used
binds, err := getCgroupMounts(m)
if err != nil {
return err
}
for _, b := range binds {
c.addCriuRestoreMount(req, b)
}
}
}
if len(c.config.MaskPaths) > 0 {
m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
c.addCriuRestoreMount(req, m)
}
for _, node := range c.config.Devices {
m := &configs.Mount{Destination: node.Path, Source: node.Path}
c.addCriuRestoreMount(req, m)
}
if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 {
c.restoreNetwork(req, criuOpts)
}
var (
fds []string
fdJSON []byte
)
if fdJSON, err = os.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
return err
}
if err := json.Unmarshal(fdJSON, &fds); err != nil {
return err
}
for i := range fds {
if s := fds[i]; strings.Contains(s, "pipe:") {
inheritFd := new(criurpc.InheritFd)
inheritFd.Key = proto.String(s)
inheritFd.Fd = proto.Int32(int32(i))
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
}
}
err = c.criuSwrk(process, req, criuOpts, extraFiles)
if err != nil {
logCriuErrors(logDir, logFile)
}
// Now that CRIU is done let's close all opened FDs CRIU needed.
for _, fd := range extraFiles {
fd.Close()
}
return err
}
// logCriuErrors tries to find and log errors from a criu log file.
// The output is similar to what "grep -n -B5 Error" does.
func logCriuErrors(dir, file string) {
lookFor := []byte("Error") // Print the line that contains this...
const max = 5 + 1 // ... and a few preceding lines.
logFile := filepath.Join(dir, file)
f, err := os.Open(logFile)
if err != nil {
logrus.Warn(err)
return
}
defer f.Close()
var lines [max][]byte
var idx, lineNo, printedLineNo int
s := bufio.NewScanner(f)
for s.Scan() {
lineNo++
lines[idx] = s.Bytes()
idx = (idx + 1) % max
if !bytes.Contains(s.Bytes(), lookFor) {
continue
}
// Found an error.
if printedLineNo == 0 {
logrus.Warnf("--- Quoting %q", logFile)
} else if lineNo-max > printedLineNo {
// Mark the gap.
logrus.Warn("...")
}
// Print the last lines.
for add := 0; add < max; add++ {
i := (idx + add) % max
s := lines[i]
actLineNo := lineNo + add - max + 1
if len(s) > 0 && actLineNo > printedLineNo {
logrus.Warnf("%d:%s", actLineNo, s)
printedLineNo = actLineNo
}
}
}
if printedLineNo != 0 {
logrus.Warn("---") // End of "Quoting ...".
}
if err := s.Err(); err != nil {
logrus.Warnf("read %q: %v", logFile, err)
}
}
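// criuApplyCgroups is a no-op except on restore, where it places the criu
// swrk process into the container's cgroups, applies the configured
// resources, and passes the resulting cgroup roots (read from
// /proc/<pid>/cgroup) to CRIU via CgRoot.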
func (c *Container) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
// need to apply cgroups only on restore
if req.GetType() != criurpc.CriuReqType_RESTORE {
return nil
}
// XXX: Do we need to deal with this case? AFAIK criu still requires root.
if err := c.cgroupManager.Apply(pid); err != nil {
return err
}
if err := c.cgroupManager.Set(c.config.Cgroups.Resources); err != nil {
return err
}
// TODO(@kolyshkin): should we use c.cgroupManager.GetPaths()
// instead of reading /proc/pid/cgroup?
path := fmt.Sprintf("/proc/%d/cgroup", pid)
cgroupsPaths, err := cgroups.ParseCgroupFile(path)
if err != nil {
return err
}
for c, p := range cgroupsPaths {
cgroupRoot := &criurpc.CgroupRoot{
Ctrl: proto.String(c),
Path: proto.String(p),
}
req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
}
return nil
}
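// criuSwrk runs "criu swrk", sends it the given RPC request over a
// SOCK_SEQPACKET socketpair, and reads responses in a loop, dispatching
// NOTIFY messages to criuNotifications until the final DUMP/PRE_DUMP/RESTORE
// response (or an error) arrives.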
func (c *Container) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, extraFiles []*os.File) error {
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
if err != nil {
return err
}
criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
criuClientFileCon, err := net.FileConn(criuClient)
criuClient.Close()
if err != nil {
return err
}
criuClientCon := criuClientFileCon.(*net.UnixConn)
defer criuClientCon.Close()
criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
defer criuServer.Close()
if c.criuVersion != 0 {
// If the CRIU Version is still '0' then this is probably
// the initial CRIU run to detect the version. Skip it.
logrus.Debugf("Using CRIU %d", c.criuVersion)
}
cmd := exec.Command("criu", "swrk", "3")
if process != nil {
cmd.Stdin = process.Stdin
cmd.Stdout = process.Stdout
cmd.Stderr = process.Stderr
}
cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
if extraFiles != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
}
if err := cmd.Start(); err != nil {
return err
}
// we close criuServer so that even if CRIU crashes or unexpectedly exits, runc will not hang.
criuServer.Close()
// cmd.Process will be replaced by a restored init.
criuProcess := cmd.Process
var criuProcessState *os.ProcessState
defer func() {
if criuProcessState == nil {
criuClientCon.Close()
_, err := criuProcess.Wait()
if err != nil {
logrus.Warnf("wait on criuProcess returned %v", err)
}
}
}()
if err := c.criuApplyCgroups(criuProcess.Pid, req); err != nil {
return err
}
var extFds []string
if process != nil {
extFds, err = getPipeFds(criuProcess.Pid)
if err != nil {
return err
}
}
logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
// In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts()
// should be empty. For older CRIU versions it still will be
// available but empty. criurpc.CriuReqType_VERSION actually
// has no req.GetOpts().
if logrus.GetLevel() >= logrus.DebugLevel &&
!(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK ||
req.GetType() == criurpc.CriuReqType_VERSION) {
val := reflect.ValueOf(req.GetOpts())
v := reflect.Indirect(val)
for i := 0; i < v.NumField(); i++ {
st := v.Type()
name := st.Field(i).Name
if 'A' <= name[0] && name[0] <= 'Z' {
value := val.MethodByName("Get" + name).Call([]reflect.Value{})
logrus.Debugf("CRIU option %s with value %v", name, value[0])
}
}
}
data, err := proto.Marshal(req)
if err != nil {
return err
}
_, err = criuClientCon.Write(data)
if err != nil {
return err
}
buf := make([]byte, 10*4096)
oob := make([]byte, 4096)
for {
n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob)
if req.Opts != nil && req.Opts.StatusFd != nil {
// Close status_fd as soon as we got something back from criu,
// assuming it has consumed (reopened) it by this time.
// Otherwise it might be left open forever and whoever
// is waiting on it will wait forever.
fd := int(*req.Opts.StatusFd)
_ = unix.Close(fd)
req.Opts.StatusFd = nil
}
if err != nil {
return err
}
if n == 0 {
return errors.New("unexpected EOF")
}
if n == len(buf) {
return errors.New("buffer is too small")
}
resp := new(criurpc.CriuResp)
err = proto.Unmarshal(buf[:n], resp)
if err != nil {
return err
}
t := resp.GetType()
if !resp.GetSuccess() {
return fmt.Errorf("criu failed: type %s errno %d", t, resp.GetCrErrno())
}
switch t {
case criurpc.CriuReqType_FEATURE_CHECK:
logrus.Debugf("Feature check says: %s", resp)
criuFeatures = resp.GetFeatures()
case criurpc.CriuReqType_NOTIFY:
if err := c.criuNotifications(resp, process, cmd, opts, extFds, oob[:oobn]); err != nil {
return err
}
req = &criurpc.CriuReq{
Type: &t,
NotifySuccess: proto.Bool(true),
}
data, err = proto.Marshal(req)
if err != nil {
return err
}
_, err = criuClientCon.Write(data)
if err != nil {
return err
}
continue
case criurpc.CriuReqType_RESTORE:
case criurpc.CriuReqType_DUMP:
case criurpc.CriuReqType_PRE_DUMP:
default:
return fmt.Errorf("unable to parse the response %s", resp.String())
}
break
}
_ = criuClientCon.CloseWrite()
// cmd.Wait() waits for cmd's goroutines, which are used for proxying file descriptors.
// Here we want to wait only for the CRIU process.
criuProcessState, err = criuProcess.Wait()
if err != nil {
return err
}
// In pre-dump mode CRIU is in a loop and waits for
// the final DUMP command.
// The current runc pre-dump approach, however, is to
// start criu in PRE_DUMP once for a single pre-dump
// and not for the whole series of pre-dump, pre-dump, ..., dump.
// If we got the message CriuReqType_PRE_DUMP, it means
// CRIU was successful and we need to forcefully stop CRIU.
if !criuProcessState.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP {
return fmt.Errorf("criu failed: %s", criuProcessState)
}
return nil
}
// lockNetwork blocks any external network activity.
func lockNetwork(config *configs.Config) error {
for _, config := range config.Networks {
strategy, err := getStrategy(config.Type)
if err != nil {
return err
}
if err := strategy.detach(config); err != nil {
return err
}
}
return nil
}
func unlockNetwork(config *configs.Config) error {
for _, config := range config.Networks {
strategy, err := getStrategy(config.Type)
if err != nil {
return err
}
if err = strategy.attach(config); err != nil {
return err
}
}
return nil
}
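// criuNotifications handles CRIU NOTIFY callbacks: locking/unlocking the
// network, running prestart/createRuntime hooks once namespaces are set up,
// bookkeeping after dump and restore, handing the orphaned pts master back
// over the console socket, and signalling lazy-pages readiness via the
// status fd.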
func (c *Container) criuNotifications(resp *criurpc.CriuResp, process *Process, cmd *exec.Cmd, opts *CriuOpts, fds []string, oob []byte) error {
notify := resp.GetNotify()
if notify == nil {
return fmt.Errorf("invalid response: %s", resp.String())
}
script := notify.GetScript()
logrus.Debugf("notify: %s\n", script)
switch script {
case "post-dump":
f, err := os.Create(filepath.Join(c.stateDir, "checkpoint"))
if err != nil {
return err
}
f.Close()
case "network-unlock":
if err := unlockNetwork(c.config); err != nil {
return err
}
case "network-lock":
if err := lockNetwork(c.config); err != nil {
return err
}
case "setup-namespaces":
if c.config.HasHook(configs.Prestart, configs.CreateRuntime) {
s, err := c.currentOCIState()
if err != nil {
return err
}
s.Pid = int(notify.GetPid())
if err := c.config.Hooks.Run(configs.Prestart, s); err != nil {
return err
}
if err := c.config.Hooks.Run(configs.CreateRuntime, s); err != nil {
return err
}
}
case "post-restore":
pid := notify.GetPid()
p, err := os.FindProcess(int(pid))
if err != nil {
return err
}
cmd.Process = p
r, err := newRestoredProcess(cmd, fds)
if err != nil {
return err
}
process.ops = r
if err := c.state.transition(&restoredState{
imageDir: opts.ImagesDirectory,
c: c,
}); err != nil {
return err
}
// create a timestamp indicating when the restored checkpoint was started
c.created = time.Now().UTC()
if _, err := c.updateState(r); err != nil {
return err
}
if err := os.Remove(filepath.Join(c.stateDir, "checkpoint")); err != nil {
if !os.IsNotExist(err) {
logrus.Error(err)
}
}
case "orphan-pts-master":
scm, err := unix.ParseSocketControlMessage(oob)
if err != nil {
return err
}
fds, err := unix.ParseUnixRights(&scm[0])
if err != nil {
return err
}
master := os.NewFile(uintptr(fds[0]), "orphan-pts-master")
defer master.Close()
// While we can access console.master, using the API is a good idea.
if err := utils.SendFile(process.ConsoleSocket, master); err != nil {
return err
}
case "status-ready":
if opts.StatusFd != -1 {
// write \0 to status fd to notify that lazy page server is ready
_, err := unix.Write(opts.StatusFd, []byte{0})
if err != nil {
logrus.Warnf("can't write \\0 to status fd: %v", err)
}
_ = unix.Close(opts.StatusFd)
opts.StatusFd = -1
}
}
return nil
}
func criuCgMode(mode string) (criurpc.CriuCgMode, error) {
switch mode {
case "":
return criurpc.CriuCgMode_DEFAULT, nil
case "soft":
return criurpc.CriuCgMode_SOFT, nil
case "full":
return criurpc.CriuCgMode_FULL, nil
case "strict":
return criurpc.CriuCgMode_STRICT, nil
case "ignore":
return criurpc.CriuCgMode_IGNORE, nil
default:
return 0, errors.New("invalid manage-cgroups-mode value")
}
}
package libcontainer
import (
"errors"
"fmt"
"os"
"slices"
"strings"
"github.com/moby/sys/user"
"github.com/sirupsen/logrus"
)
// prepareEnv processes a list of environment variables, preparing it
// for direct consumption by unix.Exec. In particular, it:
// - validates each variable is in the NAME=VALUE format and
// contains no \0 (NUL) bytes;
// - removes any duplicates (keeping only the last value for each key);
// - sets PATH for the current process, if found in the list;
// - adds HOME to the returned environment, if not found in the list.
//
// Returns the prepared environment.
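//
// An illustrative sketch (not part of the original docs; the exact HOME value
// depends on the container's /etc/passwd):
//
//	in := []string{"PATH=/usr/bin", "FOO=1", "FOO=2"}
//	out, _ := prepareEnv(in, 0)
//	// out == []string{"PATH=/usr/bin", "FOO=2", "HOME=..."}, and the current
//	// process's PATH has been set to /usr/bin.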
func prepareEnv(env []string, uid int) ([]string, error) {
if env == nil {
return nil, nil
}
// Deduplication code based on dedupEnv from Go 1.22 os/exec.
// Construct the output in reverse order, to preserve the
// last occurrence of each key.
out := make([]string, 0, len(env))
saw := make(map[string]bool, len(env))
for n := len(env); n > 0; n-- {
kv := env[n-1]
i := strings.IndexByte(kv, '=')
if i == -1 {
return nil, errors.New("invalid environment variable: missing '='")
}
if i == 0 {
return nil, errors.New("invalid environment variable: name cannot be empty")
}
key := kv[:i]
if saw[key] { // Duplicate.
continue
}
saw[key] = true
if strings.IndexByte(kv, 0) >= 0 {
return nil, fmt.Errorf("invalid environment variable %q: contains nul byte (\\x00)", key)
}
if key == "PATH" {
// Needs to be set as it is used for binary lookup.
if err := os.Setenv("PATH", kv[i+1:]); err != nil {
return nil, err
}
}
out = append(out, kv)
}
// Restore the original order.
slices.Reverse(out)
// If HOME is not found in env, get it from container's /etc/passwd and add.
if !saw["HOME"] {
home, err := getUserHome(uid)
if err != nil {
// For backward compatibility, don't return an error, but merely log it.
logrus.WithError(err).Debugf("HOME not set in process.env, and getting UID %d homedir failed", uid)
}
out = append(out, "HOME="+home)
}
return out, nil
}
func getUserHome(uid int) (string, error) {
const defaultHome = "/" // Default value, return this with any error.
u, err := user.LookupUid(uid)
if err != nil {
// ErrNoPasswdEntries is kinda expected as any UID can be specified.
if errors.Is(err, user.ErrNoPasswdEntries) {
err = nil
}
return defaultHome, err
}
return u.Home, nil
}
package exeseal
import (
"errors"
"fmt"
"io"
"os"
"strconv"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/system"
)
type SealFunc func(**os.File) error
var (
_ SealFunc = sealMemfd
_ SealFunc = sealFile
)
func isExecutable(f *os.File) bool {
if err := unix.Faccessat(int(f.Fd()), "", unix.X_OK, unix.AT_EACCESS|unix.AT_EMPTY_PATH); err == nil {
return true
} else if err == unix.EACCES {
return false
}
path := "/proc/self/fd/" + strconv.Itoa(int(f.Fd()))
if err := unix.Access(path, unix.X_OK); err == nil {
return true
} else if err == unix.EACCES {
return false
}
// Cannot check -- assume it's executable (if not, exec will fail).
logrus.Debugf("cannot do X_OK check on binary %s -- assuming it's executable", f.Name())
return true
}
const baseMemfdSeals = unix.F_SEAL_SEAL | unix.F_SEAL_SHRINK | unix.F_SEAL_GROW | unix.F_SEAL_WRITE
func sealMemfd(f **os.File) error {
if err := (*f).Chmod(0o511); err != nil {
return err
}
// Try to set the newer memfd sealing flags, but we ignore
// errors because they are not needed and we want to continue
// to work on older kernels.
fd := (*f).Fd()
// Skip F_SEAL_FUTURE_WRITE, it is not needed because we already use the
// stronger F_SEAL_WRITE (and is buggy on Linux <5.5 -- see kernel commit
// 05d351102dbe and <https://github.com/opencontainers/runc/pull/4640>).
// F_SEAL_EXEC -- Linux 6.3
const F_SEAL_EXEC = 0x20 //nolint:revive // this matches the unix.* name
_, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, F_SEAL_EXEC)
// Apply all original memfd seals.
_, err := unix.FcntlInt(fd, unix.F_ADD_SEALS, baseMemfdSeals)
return os.NewSyscallError("fcntl(F_ADD_SEALS)", err)
}
// Memfd creates a sealable executable memfd (supported since Linux 3.17).
func Memfd(comment string) (*os.File, SealFunc, error) {
file, err := system.ExecutableMemfd("runc_cloned:"+comment, unix.MFD_ALLOW_SEALING|unix.MFD_CLOEXEC)
return file, sealMemfd, err
}
func sealFile(f **os.File) error {
// When sealing an O_TMPFILE-style descriptor we need to
// re-open the path as O_PATH to clear the existing write
// handle we have.
opath, err := os.OpenFile(fmt.Sprintf("/proc/self/fd/%d", (*f).Fd()), unix.O_PATH|unix.O_CLOEXEC, 0)
if err != nil {
return fmt.Errorf("reopen tmpfile: %w", err)
}
_ = (*f).Close()
*f = opath
return nil
}
// otmpfile creates an open(O_TMPFILE) file in the given directory (supported
// since Linux 3.11).
func otmpfile(dir string) (*os.File, SealFunc, error) {
file, err := os.OpenFile(dir, unix.O_TMPFILE|unix.O_RDWR|unix.O_EXCL|unix.O_CLOEXEC, 0o700)
if err != nil {
return nil, nil, fmt.Errorf("O_TMPFILE creation failed: %w", err)
}
// Make sure we actually got an unlinked O_TMPFILE descriptor.
var stat unix.Stat_t
if err := unix.Fstat(int(file.Fd()), &stat); err != nil {
file.Close()
return nil, nil, fmt.Errorf("cannot fstat O_TMPFILE fd: %w", err)
} else if stat.Nlink != 0 {
file.Close()
return nil, nil, errors.New("O_TMPFILE has non-zero nlink")
}
return file, sealFile, err
}
// mktemp creates a classic unlinked file in the given directory.
func mktemp(dir string) (*os.File, SealFunc, error) {
file, err := os.CreateTemp(dir, "runc.")
if err != nil {
return nil, nil, err
}
// Unlink the file and verify it was unlinked.
if err := os.Remove(file.Name()); err != nil {
return nil, nil, fmt.Errorf("unlinking classic tmpfile: %w", err)
}
if err := file.Chmod(0o511); err != nil {
return nil, nil, fmt.Errorf("chmod classic tmpfile: %w", err)
}
var stat unix.Stat_t
if err := unix.Fstat(int(file.Fd()), &stat); err != nil {
return nil, nil, fmt.Errorf("cannot fstat classic tmpfile: %w", err)
} else if stat.Nlink != 0 {
return nil, nil, fmt.Errorf("classic tmpfile %s has non-zero nlink after unlink", file.Name())
}
return file, sealFile, err
}
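// getSealableFile returns a writable file to copy the binary into, together
// with a SealFunc to seal it afterwards. It tries a memfd first and falls
// back to O_TMPFILE and then to a classic unlinked temporary file.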
func getSealableFile(comment, tmpDir string) (file *os.File, sealFn SealFunc, err error) {
// First, try an executable memfd (supported since Linux 3.17).
file, sealFn, err = Memfd(comment)
if err == nil {
return
}
logrus.Debugf("memfd cloned binary failed, falling back to O_TMPFILE: %v", err)
// The tmpDir here (c.root) might be mounted noexec, so we need a couple of
// fallbacks to try. It's possible that none of these are writable and
// executable, in which case there's nothing we can practically do (other
// than mounting our own executable tmpfs, which would have its own
// issues).
tmpDirs := []string{
tmpDir,
os.TempDir(),
"/tmp",
".",
"/bin",
"/",
}
// Try to fall back to O_TMPFILE (supported since Linux 3.11).
for _, dir := range tmpDirs {
file, sealFn, err = otmpfile(dir)
if err != nil {
continue
}
if !isExecutable(file) {
logrus.Debugf("tmpdir %s is noexec -- trying a different tmpdir", dir)
file.Close()
continue
}
return
}
logrus.Debugf("O_TMPFILE cloned binary failed, falling back to mktemp(): %v", err)
// Finally, try a classic unlinked temporary file.
for _, dir := range tmpDirs {
file, sealFn, err = mktemp(dir)
if err != nil {
continue
}
if !isExecutable(file) {
logrus.Debugf("tmpdir %s is noexec -- trying a different tmpdir", dir)
file.Close()
continue
}
return
}
return nil, nil, fmt.Errorf("could not create sealable file for cloned binary: %w", err)
}
// CloneBinary creates a "sealed" clone of a given binary, which can be used to
// thwart attempts by the container process to gain access to host binaries
// through procfs magic-link shenanigans. For more details on why this is
// necessary, see CVE-2019-5736.
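//
// A usage sketch (illustrative only; the paths are assumptions and error
// handling is elided):
//
//	src, _ := os.Open("/usr/bin/some-binary")
//	defer src.Close()
//	st, _ := src.Stat()
//	sealed, err := CloneBinary(src, st.Size(), "some-binary", "/run/runc")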
func CloneBinary(src io.Reader, size int64, name, tmpDir string) (*os.File, error) {
logrus.Debugf("cloning %s binary (%d bytes)", name, size)
file, sealFn, err := getSealableFile(name, tmpDir)
if err != nil {
return nil, err
}
copied, err := system.Copy(file, src)
if err != nil {
file.Close()
return nil, fmt.Errorf("copy binary: %w", err)
} else if copied != size {
file.Close()
return nil, fmt.Errorf("copied binary size mismatch: %d != %d", copied, size)
}
if err := sealFn(&file); err != nil {
file.Close()
return nil, fmt.Errorf("could not seal fd: %w", err)
}
return file, nil
}
// IsCloned returns whether the given file can be guaranteed to be a safe exe.
func IsCloned(exe *os.File) bool {
seals, err := unix.FcntlInt(exe.Fd(), unix.F_GET_SEALS, 0)
if err != nil {
// /proc/self/exe is probably not a memfd
logrus.Debugf("F_GET_SEALS on %s failed: %v", exe.Name(), err)
return false
}
// The memfd must have all of the base seals applied.
logrus.Debugf("checking %s memfd seals: 0x%x", exe.Name(), seals)
return seals&baseMemfdSeals == baseMemfdSeals
}
// CloneSelfExe makes a clone of the current process's binary (through
// /proc/self/exe). This binary can then be used for "runc init" in order to
// make sure the container process can never resolve the original runc binary.
// For more details on why this is necessary, see CVE-2019-5736.
func CloneSelfExe(tmpDir string) (*os.File, error) {
// Try to create a temporary overlayfs to produce a readonly version of
// /proc/self/exe that cannot be "unwrapped" by the container. In contrast
// to CloneBinary, this technique does not require any extra memory usage
// and does not have the (fairly noticeable) performance impact of copying
// a large binary file into a memfd.
//
// Based on some basic performance testing, the overlayfs approach has
// effectively no performance overhead (it is on par with both
// MS_BIND+MS_RDONLY and no binary cloning at all) while memfd copying adds
// around ~60% overhead during container startup.
overlayFile, err := sealedOverlayfs("/proc/self/exe", tmpDir)
if err == nil {
logrus.Debug("runc exeseal: using overlayfs for sealed /proc/self/exe") // used for tests
return overlayFile, nil
}
logrus.WithError(err).Debugf("could not use overlayfs for /proc/self/exe sealing -- falling back to making a temporary copy")
selfExe, err := os.Open("/proc/self/exe")
if err != nil {
return nil, fmt.Errorf("opening current binary: %w", err)
}
defer selfExe.Close()
stat, err := selfExe.Stat()
if err != nil {
return nil, fmt.Errorf("checking /proc/self/exe size: %w", err)
}
size := stat.Size()
return CloneBinary(selfExe, size, "/proc/self/exe", tmpDir)
}
// IsSelfExeCloned returns whether /proc/self/exe is a cloned binary that can
// be guaranteed to be safe. This means that it must be a sealed memfd. Other
// types of clones cannot be completely verified as safe.
func IsSelfExeCloned() bool {
selfExe, err := os.Open("/proc/self/exe")
if err != nil {
logrus.Debugf("open /proc/self/exe failed: %v", err)
return false
}
defer selfExe.Close()
return IsCloned(selfExe)
}
package exeseal
import (
"fmt"
"os"
"path/filepath"
"runtime"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/utils"
)
func fsopen(fsName string, flags int) (*os.File, error) {
// Make sure we always set O_CLOEXEC.
flags |= unix.FSOPEN_CLOEXEC
fd, err := unix.Fsopen(fsName, flags)
if err != nil {
return nil, os.NewSyscallError("fsopen "+fsName, err)
}
return os.NewFile(uintptr(fd), "fscontext:"+fsName), nil
}
func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) {
// Make sure we always set O_CLOEXEC.
flags |= unix.FSMOUNT_CLOEXEC
fd, err := unix.Fsmount(int(ctx.Fd()), flags, mountAttrs)
if err != nil {
return nil, os.NewSyscallError("fsmount "+ctx.Name(), err)
}
runtime.KeepAlive(ctx) // make sure fd is kept alive while it's used
return os.NewFile(uintptr(fd), "fsmount:"+ctx.Name()), nil
}
func escapeOverlayLowerDir(path string) string {
// If the lowerdir path contains ":" we need to escape them, and if there
// were any escape characters already (\) we need to escape those first.
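// For example (illustrative): `/opt/a:b\c` becomes `/opt/a\:b\\c`.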
return strings.ReplaceAll(strings.ReplaceAll(path, `\`, `\\`), `:`, `\:`)
}
// sealedOverlayfs will create an internal overlayfs mount using fsopen() that
// uses the directory containing the binary as a lowerdir and a temporary tmpfs
// as an upperdir. There is no way to "unwrap" this (unlike MS_BIND+MS_RDONLY)
// and so we can create a safe zero-copy sealed version of /proc/self/exe.
// This only works for privileged users and on kernels with overlayfs and
// fsopen() enabled.
//
// TODO: Since Linux 5.11, overlayfs can be created inside user namespaces so
// it is technically possible to create an overlayfs even for rootless
// containers. Unfortunately, this would require some ugly manual CGo+fork
// magic so we can do this later if we feel it's really needed.
func sealedOverlayfs(binPath, tmpDir string) (_ *os.File, Err error) {
// Try to do the superblock creation first to bail out early if we can't
// use this method.
overlayCtx, err := fsopen("overlay", unix.FSOPEN_CLOEXEC)
if err != nil {
return nil, err
}
defer overlayCtx.Close()
// binPath is going to be /proc/self/exe, so do a readlink to get the real
// path. overlayfs needs the real underlying directory for this protection
// mode to work properly.
if realPath, err := os.Readlink(binPath); err == nil {
binPath = realPath
}
binLowerDirPath, binName := filepath.Split(binPath)
// Escape any ":"s or "\"s in the path.
binLowerDirPath = escapeOverlayLowerDir(binLowerDirPath)
// Overlayfs requires two lowerdirs in order to run in "lower-only" mode,
// where writes are completely blocked. Ideally we would create a dummy
// tmpfs for this, but it turns out that overlayfs doesn't allow for
// anonymous mountns paths.
// NOTE: I'm working on a patch to fix this but it won't be backported.
dummyLowerDirPath := escapeOverlayLowerDir(tmpDir)
// Configure the lowerdirs. The binary lowerdir needs to be on the top to
// ensure that a file called "runc" (binName) in the dummy lowerdir doesn't
// mask the binary.
lowerDirStr := binLowerDirPath + ":" + dummyLowerDirPath
if err := unix.FsconfigSetString(int(overlayCtx.Fd()), "lowerdir", lowerDirStr); err != nil {
return nil, fmt.Errorf("fsconfig set overlayfs lowerdir=%s: %w", lowerDirStr, err)
}
// We don't care about xino (Linux 4.17) but it will be auto-enabled on
// some systems (if /run/runc and /usr/bin are on different filesystems)
// and this produces spurious dmesg log entries. We can safely ignore
// errors when disabling this because we don't actually care about the
// setting and we're just opportunistically disabling it.
_ = unix.FsconfigSetString(int(overlayCtx.Fd()), "xino", "off")
// Get an actual handle to the overlayfs.
if err := unix.FsconfigCreate(int(overlayCtx.Fd())); err != nil {
return nil, os.NewSyscallError("fsconfig create overlayfs", err)
}
overlayFd, err := fsmount(overlayCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOSUID)
if err != nil {
return nil, err
}
defer overlayFd.Close()
// Grab a handle to the binary through overlayfs.
exeFile, err := utils.Openat(overlayFd, binName, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
if err != nil {
return nil, fmt.Errorf("open %s from overlayfs (lowerdir=%s): %w", binName, lowerDirStr, err)
}
// NOTE: We would like to check that exeFile is the same as /proc/self/exe,
// except this is a little difficult. Depending on what filesystems the
// layers are on, overlayfs can remap the inode numbers (and it always
// creates its own device numbers -- see ovl_map_dev_ino) so we can't do a
// basic stat-based check. The only reasonable option would be to hash both
// files and compare them, but this would require fully reading both files
// which would produce a similar performance overhead to memfd cloning.
//
// Ultimately, there isn't a real attack to be worried about here. An
// attacker would need to be able to modify files in /usr/sbin (or wherever
// runc lives), at which point they could just replace the runc binary with
// something malicious anyway.
return exeFile, nil
}
package libcontainer
import (
"encoding/json"
"errors"
"fmt"
"os"
securejoin "github.com/cyphar/filepath-securejoin"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/manager"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/configs/validate"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/utils"
)
const (
stateFilename = "state.json"
execFifoFilename = "exec.fifo"
)
// Create creates a new container with the given id inside a given state
// directory (root), and returns a Container object.
//
// The root is a state directory which many containers can share. It can be
// used later to get the list of containers, or to get information about a
// particular container (see Load).
//
// The id must be non-empty and consist only of the following characters:
// ASCII letters, digits, underscore, plus, minus, period. The id must be
// unique and non-existent for the given root path.
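//
// A minimal usage sketch (illustrative; the root path is an assumption, and
// config construction and error handling are omitted):
//
//	c, err := Create("/run/runc", "mycontainer", config)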
func Create(root, id string, config *configs.Config) (*Container, error) {
if root == "" {
return nil, errors.New("root not set")
}
if err := validateID(id); err != nil {
return nil, err
}
if err := validate.Validate(config); err != nil {
return nil, err
}
if err := os.MkdirAll(root, 0o700); err != nil {
return nil, err
}
stateDir, err := securejoin.SecureJoin(root, id)
if err != nil {
return nil, err
}
if _, err := os.Stat(stateDir); err == nil {
return nil, ErrExist
} else if !os.IsNotExist(err) {
return nil, err
}
cm, err := manager.New(config.Cgroups)
if err != nil {
return nil, err
}
// Check that the cgroup does not exist or is empty (no processes).
// Note that for cgroup v1 this check is not thorough, as there are multiple
// separate hierarchies, while both Exists() and GetAllPids() only use the
// one for the "devices" controller (assuming the others are the same, which is
// probably true in almost all scenarios). Checking all the hierarchies
// would be too expensive.
if cm.Exists() {
pids, err := cm.GetAllPids()
// Reading PIDs can race with cgroups removal, so ignore ENOENT and ENODEV.
if err != nil && !errors.Is(err, os.ErrNotExist) && !errors.Is(err, unix.ENODEV) {
return nil, fmt.Errorf("unable to get cgroup PIDs: %w", err)
}
if len(pids) != 0 {
return nil, fmt.Errorf("container's cgroup is not empty: %d process(es) found", len(pids))
}
}
// Check that cgroup is not frozen. Do not use Exists() here
// since in cgroup v1 it only checks "devices" controller.
st, err := cm.GetFreezerState()
if err != nil {
return nil, fmt.Errorf("unable to get cgroup freezer state: %w", err)
}
if st == cgroups.Frozen {
return nil, errors.New("container's cgroup unexpectedly frozen")
}
// Parent directory is already created above, so Mkdir is enough.
if err := os.Mkdir(stateDir, 0o711); err != nil {
return nil, err
}
c := &Container{
id: id,
stateDir: stateDir,
config: config,
cgroupManager: cm,
intelRdtManager: intelrdt.NewManager(config, id, ""),
}
c.state = &stoppedState{c: c}
return c, nil
}
// Load takes a path to the state directory (root) and an id of an existing
// container, and returns a Container object reconstructed from the saved
// state. This presents a read-only view of the container.
func Load(root, id string) (*Container, error) {
if root == "" {
return nil, errors.New("root not set")
}
// When loading, check that the id is valid.
if err := validateID(id); err != nil {
return nil, err
}
stateDir, err := securejoin.SecureJoin(root, id)
if err != nil {
return nil, err
}
state, err := loadState(stateDir)
if err != nil {
return nil, err
}
r := &nonChildProcess{
processPid: state.InitProcessPid,
processStartTime: state.InitProcessStartTime,
fds: state.ExternalDescriptors,
}
cm, err := manager.NewWithPaths(state.Config.Cgroups, state.CgroupPaths)
if err != nil {
return nil, err
}
c := &Container{
initProcess: r,
initProcessStartTime: state.InitProcessStartTime,
id: id,
config: &state.Config,
cgroupManager: cm,
intelRdtManager: intelrdt.NewManager(&state.Config, id, state.IntelRdtPath),
stateDir: stateDir,
created: state.Created,
}
c.state = &loadedState{c: c}
if err := c.refreshState(); err != nil {
return nil, err
}
return c, nil
}
func loadState(root string) (*State, error) {
stateFilePath, err := securejoin.SecureJoin(root, stateFilename)
if err != nil {
return nil, err
}
f, err := os.Open(stateFilePath)
if err != nil {
if os.IsNotExist(err) {
return nil, ErrNotExist
}
return nil, err
}
defer f.Close()
var state *State
if err := json.NewDecoder(f).Decode(&state); err != nil {
return nil, err
}
return state, nil
}
// validateID checks if the supplied container ID is valid, returning
// the ErrInvalidID in case it is not.
//
// The format of a valid ID was never formally defined; instead, the code
// was modified to allow or disallow specific characters.
//
// Currently, a valid ID is a non-empty string consisting only of
// the following characters:
// - uppercase (A-Z) and lowercase (a-z) Latin letters;
// - digits (0-9);
// - underscore (_);
// - plus sign (+);
// - minus sign (-);
// - period (.).
//
// In addition, IDs that can't be used to represent a file name
// (such as . or ..) are rejected.
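//
// For example (illustrative): "my-container_1.0" is valid, while "", "..",
// and "a/b" are rejected.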
func validateID(id string) error {
if len(id) < 1 {
return ErrInvalidID
}
// Allowed characters: 0-9 A-Z a-z _ + - .
for i := 0; i < len(id); i++ {
c := id[i]
switch {
case c >= 'a' && c <= 'z':
case c >= 'A' && c <= 'Z':
case c >= '0' && c <= '9':
case c == '_':
case c == '+':
case c == '-':
case c == '.':
default:
return ErrInvalidID
}
}
if string(os.PathSeparator)+id != utils.CleanPath(string(os.PathSeparator)+id) {
return ErrInvalidID
}
return nil
}
package libcontainer
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"net"
"os"
"path/filepath"
"runtime"
"runtime/debug"
"strconv"
"syscall"
"github.com/containerd/console"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/capabilities"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
)
type initType string
const (
initSetns initType = "setns"
initStandard initType = "standard"
)
type pid struct {
Pid int `json:"stage2_pid"`
PidFirstChild int `json:"stage1_pid"`
}
// network is an internal struct used to setup container networks.
type network struct {
configs.Network
// TempVethPeerName is a unique temporary veth peer name that was placed into
// the container's namespace.
TempVethPeerName string `json:"temp_veth_peer_name"`
}
// initConfig is used for transferring parameters from Exec() to Init().
// It contains:
// - original container config;
// - some [Process] properties;
// - set of properties merged from the container config ([configs.Config])
// and the process ([Process]);
// - some properties that come from the container.
//
// When adding new fields, please make sure they go into the relevant section.
type initConfig struct {
// Config is the original container config.
Config *configs.Config `json:"config"`
// Properties that are unique to and come from [Process].
Args []string `json:"args"`
Env []string `json:"env"`
UID int `json:"uid"`
GID int `json:"gid"`
AdditionalGroups []int `json:"additional_groups"`
Cwd string `json:"cwd"`
CreateConsole bool `json:"create_console"`
ConsoleWidth uint16 `json:"console_width"`
ConsoleHeight uint16 `json:"console_height"`
PassedFilesCount int `json:"passed_files_count"`
// Properties that exist both in the container config and the process,
// as merged by [Container.newInitConfig] (process properties take precedence).
AppArmorProfile string `json:"apparmor_profile"`
Capabilities *configs.Capabilities `json:"capabilities"`
NoNewPrivileges bool `json:"no_new_privileges"`
ProcessLabel string `json:"process_label"`
Rlimits []configs.Rlimit `json:"rlimits"`
IOPriority *configs.IOPriority `json:"io_priority,omitempty"`
Scheduler *configs.Scheduler `json:"scheduler,omitempty"`
// Miscellaneous properties, filled in by [Container.newInitConfig]
// unless documented otherwise.
ContainerID string `json:"containerid"`
Cgroup2Path string `json:"cgroup2_path,omitempty"`
// Networks is filled in from container config by [initProcess.createNetworkInterfaces].
Networks []*network `json:"network"`
// SpecState is filled in by [initProcess.Start].
SpecState *specs.State `json:"spec_state,omitempty"`
}
// Init is part of "runc init" implementation.
func Init() {
runtime.GOMAXPROCS(1)
runtime.LockOSThread()
if err := startInitialization(); err != nil {
// If the error is returned, it was not communicated
// back to the parent (which is not a common case),
// so print it to stderr here as a last resort.
//
// Do not use logrus as we are not sure if it has been
// set up yet, but most important, if the parent is
// alive (and its log forwarding is working).
fmt.Fprintln(os.Stderr, err)
}
// Normally, startInitialization() never returns, meaning
// that if we are here, it has failed.
os.Exit(255)
}
// Normally, this function does not return. If it returns, with or without an
// error, it means the initialization has failed. If the error is returned,
// it means the error can not be communicated back to the parent.
func startInitialization() (retErr error) {
// Get the synchronisation pipe.
envSyncPipe := os.Getenv("_LIBCONTAINER_SYNCPIPE")
syncPipeFd, err := strconv.Atoi(envSyncPipe)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_SYNCPIPE: %w", err)
}
syncPipe := newSyncSocket(os.NewFile(uintptr(syncPipeFd), "sync"))
defer syncPipe.Close()
defer func() {
// If this defer is ever called, this means initialization has failed.
// Send the error back to the parent process in the form of an initError
// if the sync socket has not been closed.
if syncPipe.isClosed() {
return
}
ierr := initError{Message: retErr.Error()}
if err := writeSyncArg(syncPipe, procError, ierr); err != nil {
fmt.Fprintln(os.Stderr, err)
return
}
// The error is sent, no need to also return it (or it will be reported twice).
retErr = nil
}()
// Get the INITPIPE.
envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE")
initPipeFd, err := strconv.Atoi(envInitPipe)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err)
}
initPipe := os.NewFile(uintptr(initPipeFd), "init")
defer initPipe.Close()
// Set up logging. This is used rarely, and mostly for init debugging.
// Passing log level is optional; currently libcontainer/integration does not do it.
if levelStr := os.Getenv("_LIBCONTAINER_LOGLEVEL"); levelStr != "" {
logLevel, err := strconv.Atoi(levelStr)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_LOGLEVEL: %w", err)
}
logrus.SetLevel(logrus.Level(logLevel))
}
logFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_LOGPIPE"))
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE: %w", err)
}
logPipe := os.NewFile(uintptr(logFd), "logpipe")
logrus.SetOutput(logPipe)
logrus.SetFormatter(new(logrus.JSONFormatter))
logrus.Debug("child process in init()")
// Only init processes have FIFOFD.
var fifoFile *os.File
envInitType := os.Getenv("_LIBCONTAINER_INITTYPE")
it := initType(envInitType)
if it == initStandard {
fifoFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_FIFOFD"))
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD: %w", err)
}
fifoFile = os.NewFile(uintptr(fifoFd), "initfifo")
}
var consoleSocket *os.File
if envConsole := os.Getenv("_LIBCONTAINER_CONSOLE"); envConsole != "" {
console, err := strconv.Atoi(envConsole)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE: %w", err)
}
consoleSocket = os.NewFile(uintptr(console), "console-socket")
defer consoleSocket.Close()
}
var pidfdSocket *os.File
if envSockFd := os.Getenv("_LIBCONTAINER_PIDFD_SOCK"); envSockFd != "" {
sockFd, err := strconv.Atoi(envSockFd)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_PIDFD_SOCK: %w", err)
}
pidfdSocket = os.NewFile(uintptr(sockFd), "pidfd-socket")
defer pidfdSocket.Close()
}
// From here on, we don't need the current process's environment. It is not
// used directly anywhere below this point, but let's clear it anyway.
os.Clearenv()
defer func() {
if err := recover(); err != nil {
if err2, ok := err.(error); ok {
retErr = fmt.Errorf("panic from initialization: %w, %s", err2, debug.Stack())
} else {
retErr = fmt.Errorf("panic from initialization: %v, %s", err, debug.Stack())
}
}
}()
var config initConfig
if err := json.NewDecoder(initPipe).Decode(&config); err != nil {
return err
}
// If init succeeds, it will not return, hence none of the defers will be called.
return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifoFile, logPipe)
}
func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket, fifoFile, logPipe *os.File) error {
env, err := prepareEnv(config.Env, config.UID)
if err != nil {
return err
}
config.Env = env
// Clean the RLIMIT_NOFILE cache in go runtime.
// Issue: https://github.com/opencontainers/runc/issues/4195
maybeClearRlimitNofileCache(config.Rlimits)
switch t {
case initSetns:
i := &linuxSetnsInit{
pipe: pipe,
consoleSocket: consoleSocket,
pidfdSocket: pidfdSocket,
config: config,
logPipe: logPipe,
}
return i.Init()
case initStandard:
i := &linuxStandardInit{
pipe: pipe,
consoleSocket: consoleSocket,
pidfdSocket: pidfdSocket,
parentPid: unix.Getppid(),
config: config,
fifoFile: fifoFile,
logPipe: logPipe,
}
return i.Init()
}
return fmt.Errorf("unknown init type %q", t)
}
// verifyCwd ensures that the current directory is actually inside the mount
// namespace root of the current process.
func verifyCwd() error {
// getcwd(2) on Linux detects if cwd is outside of the rootfs of the
// current mount namespace root, and in that case prefixes "(unreachable)"
// to the returned string. glibc's getcwd(3) and Go's Getwd() both detect
// when this happens and return ENOENT rather than returning a non-absolute
// path. In both cases we can therefore easily detect if we have an invalid
// cwd by checking the return value of getcwd(3). See getcwd(3) for more
// details, and CVE-2024-21626 for the security issue that motivated this
// check.
//
// We have to use unix.Getwd() here because os.Getwd() has a workaround for
// $PWD which involves doing stat(.), which can fail if the current
// directory is inaccessible to the container process.
if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) {
return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected")
} else if err != nil {
return fmt.Errorf("failed to verify if current working directory is safe: %w", err)
} else if !filepath.IsAbs(wd) {
// We shouldn't ever hit this, but check just in case.
return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd)
}
return nil
}
// finalizeNamespace drops the caps, sets the correct user
// and working dir, and closes any leaked file descriptors
// before executing the command inside the namespace.
func finalizeNamespace(config *initConfig) error {
// Ensure that all unwanted fds we may have accidentally
// inherited are marked close-on-exec so they stay out of the
// container
if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
return fmt.Errorf("error closing exec fds: %w", err)
}
// we only do chdir if it's specified
doChdir := config.Cwd != ""
if doChdir {
// First, attempt the chdir before setting up the user.
// This could allow us to access a directory that the user running runc can access
// but the container user cannot.
err := unix.Chdir(config.Cwd)
switch {
case err == nil:
doChdir = false
case os.IsPermission(err):
// If we hit an EPERM, we should attempt again after setting up user.
// This will allow us to successfully chdir if the container user has access
// to the directory, but the user running runc does not.
// This is useful in cases where the cwd is also a volume that's been chowned to the container user.
default:
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
}
}
w, err := capabilities.New(config.Capabilities)
if err != nil {
return err
}
// drop capabilities in bounding set before changing user
if err := w.ApplyBoundingSet(); err != nil {
return fmt.Errorf("unable to apply bounding set: %w", err)
}
// preserve existing capabilities while we change users
if err := system.SetKeepCaps(); err != nil {
return fmt.Errorf("unable to set keep caps: %w", err)
}
if err := setupUser(config); err != nil {
return fmt.Errorf("unable to setup user: %w", err)
}
// Change working directory AFTER the user has been set up, if we haven't done it yet.
if doChdir {
if err := unix.Chdir(config.Cwd); err != nil {
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
}
}
// Make sure our final working directory is inside the container.
if err := verifyCwd(); err != nil {
return err
}
if err := system.ClearKeepCaps(); err != nil {
return fmt.Errorf("unable to clear keep caps: %w", err)
}
if err := w.ApplyCaps(); err != nil {
return fmt.Errorf("unable to apply caps: %w", err)
}
return nil
}
// setupConsole sets up the console from inside the container, and sends the
// master pty fd over the provided console socket (using cmsg). This is done to ensure that
// consoles are scoped to a container properly (see runc#814 and the many
// issues related to that). This has to be run *after* we've pivoted to the new
// rootfs (and the users' configuration is entirely set up).
func setupConsole(socket *os.File, config *initConfig, mount bool) error {
defer socket.Close()
// At this point, /dev/ptmx points to something that we would expect. We
// used to change the owner of the slave path, but the /dev/pts mount
// can have gid=X set (at the user's option), so touching the owner of the
// slave PTY is not necessary, as the kernel will handle that for us. Note
// however, that setupUser (specifically fixStdioPermissions) *will* change
// the UID owner of the console to be the user the process will run as (so
// they can actually control their console).
pty, slavePath, err := console.NewPty()
if err != nil {
return err
}
// After we return from here, we don't need the console anymore.
defer pty.Close()
if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 {
err = pty.Resize(console.WinSize{
Height: config.ConsoleHeight,
Width: config.ConsoleWidth,
})
if err != nil {
return err
}
}
// Mount the console inside our rootfs.
if mount {
if err := mountConsole(slavePath); err != nil {
return err
}
}
// While we can access console.master, using the API is a good idea.
if err := utils.SendRawFd(socket, pty.Name(), pty.Fd()); err != nil {
return err
}
runtime.KeepAlive(pty)
// Now, dup over all the things.
return dupStdio(slavePath)
}
// syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec.
func syncParentReady(pipe *syncSocket) error {
// Tell parent.
if err := writeSync(pipe, procReady); err != nil {
return err
}
// Wait for parent to give the all-clear.
return readSync(pipe, procRun)
}
// syncParentHooks sends to the given pipe a JSON payload which indicates that
// the parent should execute pre-start hooks. It then waits for the parent to
// indicate that it is cleared to resume.
func syncParentHooks(pipe *syncSocket) error {
// Tell parent.
if err := writeSync(pipe, procHooks); err != nil {
return err
}
// Wait for parent to give the all-clear.
return readSync(pipe, procHooksDone)
}
// syncParentSeccomp sends the fd associated with the seccomp file descriptor
// to the parent, and waits for the parent to do pidfd_getfd() to grab a copy.
func syncParentSeccomp(pipe *syncSocket, seccompFd int) error {
if seccompFd == -1 {
return nil
}
defer unix.Close(seccompFd)
// Tell parent to grab our fd.
//
// Notably, we do not use writeSyncFile here because a container might have
// an SCMP_ACT_NOTIFY action on sendmsg(2) so we need to use the smallest
// possible number of system calls here because all of those syscalls
// cannot be used with SCMP_ACT_NOTIFY as a result (any syscall we use here
// before the parent gets the file descriptor would deadlock "runc init" if
// we allowed it for SCMP_ACT_NOTIFY). See seccomp.InitSeccomp() for more
// details.
if err := writeSyncArg(pipe, procSeccomp, seccompFd); err != nil {
return err
}
// Wait for parent to tell us they've grabbed the seccompfd.
return readSync(pipe, procSeccompDone)
}
// setupUser changes the groups, gid, and uid for the user inside the container.
func setupUser(config *initConfig) error {
// Before we change to the container's user, make sure that the process's
// STDIO is correctly owned by the user that we are switching to.
if err := fixStdioPermissions(config.UID); err != nil {
return err
}
// We don't need to use /proc/thread-self here because setgroups is a
// per-userns file and thus is global to all threads in a thread-group.
// This lets us avoid having to do runtime.LockOSThread.
setgroups, err := os.ReadFile("/proc/self/setgroups")
if err != nil && !os.IsNotExist(err) {
return err
}
// This isn't allowed in an unprivileged user namespace since Linux 3.19.
// There's nothing we can do about /etc/group entries, so we silently
// ignore setting groups here (since the user didn't explicitly ask us to
// set the group).
allowSupGroups := !config.Config.RootlessEUID && string(bytes.TrimSpace(setgroups)) != "deny"
if allowSupGroups {
if err := unix.Setgroups(config.AdditionalGroups); err != nil {
return &os.SyscallError{Syscall: "setgroups", Err: err}
}
}
if err := unix.Setgid(config.GID); err != nil {
if err == unix.EINVAL {
return fmt.Errorf("cannot setgid to unmapped gid %d in user namespace", config.GID)
}
return err
}
if err := unix.Setuid(config.UID); err != nil {
if err == unix.EINVAL {
return fmt.Errorf("cannot setuid to unmapped uid %d in user namespace", config.UID)
}
return err
}
return nil
}
// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified uid.
// The ownership needs to match because it is created outside of the container and needs to be
// localized.
func fixStdioPermissions(uid int) error {
var null unix.Stat_t
if err := unix.Stat("/dev/null", &null); err != nil {
return &os.PathError{Op: "stat", Path: "/dev/null", Err: err}
}
for _, file := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
var s unix.Stat_t
if err := unix.Fstat(int(file.Fd()), &s); err != nil {
return &os.PathError{Op: "fstat", Path: file.Name(), Err: err}
}
// Skip chown if uid is already the one we want or any of the STDIO descriptors
// were redirected to /dev/null.
if int(s.Uid) == uid || s.Rdev == null.Rdev {
continue
}
// We only change the uid (as it is possible for the mount to
// prefer a different gid, and there's no reason for us to change it).
// The reason why we don't just leave the default uid=X mount setup is
// that users expect to be able to actually use their console. Without
// this code, you couldn't effectively run as a non-root user inside a
// container and also have a console set up.
if err := file.Chown(uid, int(s.Gid)); err != nil {
// If we've hit an EINVAL then s.Gid isn't mapped in the user
// namespace. If we've hit an EPERM then the inode's current owner
// is not mapped in our user namespace (in particular,
// privileged_wrt_inode_uidgid() has failed). Read-only
// /dev can result in EROFS error. In any case, it's
// better for us to just not touch the stdio rather
// than bail at this point.
if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
continue
}
return err
}
}
return nil
}
// setupNetwork sets up and initializes any network interface inside the container.
func setupNetwork(config *initConfig) error {
for _, config := range config.Networks {
strategy, err := getStrategy(config.Type)
if err != nil {
return err
}
if err := strategy.initialize(config); err != nil {
return err
}
}
return nil
}
func setupRoute(config *configs.Config) error {
for _, config := range config.Routes {
_, dst, err := net.ParseCIDR(config.Destination)
if err != nil {
return err
}
src := net.ParseIP(config.Source)
if src == nil {
return fmt.Errorf("Invalid source for route: %s", config.Source)
}
gw := net.ParseIP(config.Gateway)
if gw == nil {
return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
}
l, err := netlink.LinkByName(config.InterfaceName)
if err != nil {
return err
}
route := &netlink.Route{
Scope: netlink.SCOPE_UNIVERSE,
Dst: dst,
Src: src,
Gw: gw,
LinkIndex: l.Attrs().Index,
}
if err := netlink.RouteAdd(route); err != nil {
return err
}
}
return nil
}
func maybeClearRlimitNofileCache(limits []configs.Rlimit) {
for _, rlimit := range limits {
if rlimit.Type == syscall.RLIMIT_NOFILE {
system.ClearRlimitNofileCache(&syscall.Rlimit{
Cur: rlimit.Soft,
Max: rlimit.Hard,
})
return
}
}
}
func setupRlimits(limits []configs.Rlimit, pid int) error {
for _, rlimit := range limits {
if err := unix.Prlimit(pid, rlimit.Type, &unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}, nil); err != nil {
return fmt.Errorf("error setting rlimit type %v: %w", rlimit.Type, err)
}
}
return nil
}
func setupScheduler(config *initConfig) error {
if config.Scheduler == nil {
return nil
}
attr, err := configs.ToSchedAttr(config.Scheduler)
if err != nil {
return err
}
if err := unix.SchedSetAttr(0, attr, 0); err != nil {
if errors.Is(err, unix.EPERM) && config.Config.Cgroups.CpusetCpus != "" {
return errors.New("process scheduler can't be used together with AllowedCPUs")
}
return fmt.Errorf("error setting scheduler: %w", err)
}
return nil
}
func setupIOPriority(config *initConfig) error {
const ioprioWhoPgrp = 1
ioprio := config.IOPriority
if ioprio == nil {
return nil
}
class := 0
switch ioprio.Class {
case specs.IOPRIO_CLASS_RT:
class = 1
case specs.IOPRIO_CLASS_BE:
class = 2
case specs.IOPRIO_CLASS_IDLE:
class = 3
default:
return fmt.Errorf("invalid io priority class: %s", ioprio.Class)
}
// Combine class and priority into a single value
// https://github.com/torvalds/linux/blob/v5.18/include/uapi/linux/ioprio.h#L5-L17
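// For example (illustrative): IOPRIO_CLASS_BE (class 2) with priority 4
// yields (2 << 13) | 4 == 0x4004.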
iop := (class << 13) | ioprio.Priority
_, _, errno := unix.RawSyscall(unix.SYS_IOPRIO_SET, ioprioWhoPgrp, 0, uintptr(iop))
if errno != 0 {
return fmt.Errorf("failed to set io priority: %w", errno)
}
return nil
}
func setupPersonality(config *configs.Config) error {
return system.SetLinuxPersonality(config.Personality.Domain)
}
// signalAllProcesses freezes then iterates over all the processes inside the
// manager's cgroups, sending the signal s to them.
func signalAllProcesses(m cgroups.Manager, s unix.Signal) error {
if !m.Exists() {
return ErrCgroupNotExist
}
// Use cgroup.kill, if available.
if s == unix.SIGKILL {
if p := m.Path(""); p != "" { // Either cgroup v2 or hybrid.
err := cgroups.WriteFile(p, "cgroup.kill", "1")
if err == nil || !errors.Is(err, os.ErrNotExist) {
return err
}
// Fallback to old implementation.
}
}
if err := m.Freeze(cgroups.Frozen); err != nil {
logrus.Warn(err)
}
pids, err := m.GetAllPids()
if err != nil {
if err := m.Freeze(cgroups.Thawed); err != nil {
logrus.Warn(err)
}
return err
}
for _, pid := range pids {
err := unix.Kill(pid, s)
if err != nil && err != unix.ESRCH {
logrus.Warnf("kill %d: %v", pid, err)
}
}
if err := m.Freeze(cgroups.Thawed); err != nil {
logrus.Warn(err)
}
return nil
}
// setupPidfd opens a process file descriptor of the init process, and sends the
// file descriptor back to the socket.
func setupPidfd(socket *os.File, initType string) error {
defer socket.Close()
pidFd, err := unix.PidfdOpen(os.Getpid(), 0)
if err != nil {
return fmt.Errorf("failed to pidfd_open: %w", err)
}
if err := utils.SendRawFd(socket, initType, uintptr(pidFd)); err != nil {
unix.Close(pidFd)
return fmt.Errorf("failed to send pidfd on socket: %w", err)
}
return unix.Close(pidFd)
}
package intelrdt
var cmtEnabled bool
// Check if Intel RDT/CMT is enabled.
func IsCMTEnabled() bool {
featuresInit()
return cmtEnabled
}
func getCMTNumaNodeStats(numaPath string) (*CMTNumaNodeStats, error) {
stats := &CMTNumaNodeStats{}
if enabledMonFeatures.llcOccupancy {
llcOccupancy, err := getIntelRdtParamUint(numaPath, "llc_occupancy")
if err != nil {
return nil, err
}
stats.LLCOccupancy = llcOccupancy
}
return stats, nil
}
package intelrdt
import (
"bytes"
"errors"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"github.com/moby/sys/mountinfo"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
/*
* About Intel RDT features:
* Intel platforms with newer Xeon CPUs support Resource Director Technology (RDT).
* Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are
* two sub-features of RDT.
*
* Cache Allocation Technology (CAT) provides a way for the software to restrict
* cache allocation to a defined 'subset' of L3 cache which may be overlapping
* with other 'subsets'. The different subsets are identified by class of
* service (CLOS) and each CLOS has a capacity bitmask (CBM).
*
* Memory Bandwidth Allocation (MBA) provides indirect and approximate throttling
* of memory bandwidth for the software. A user controls the resource by
* indicating the percentage of maximum memory bandwidth, or a memory bandwidth
* limit in MBps if the MBA Software Controller is enabled.
*
* More details about Intel RDT CAT and MBA can be found in the section 17.18
* of Intel Software Developer Manual:
* https://software.intel.com/en-us/articles/intel-sdm
*
* About Intel RDT kernel interface:
* In Linux 4.10 kernel or newer, the interface is defined and exposed via
* "resource control" filesystem, which is a "cgroup-like" interface.
*
* Compared with cgroups, it has a similar process management lifecycle and
* interfaces in a container. But unlike cgroups' hierarchy, it has a single-level
* filesystem layout.
*
* CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via
* "resource control" filesystem.
*
* Intel RDT "resource control" filesystem hierarchy:
* mount -t resctrl resctrl /sys/fs/resctrl
* tree /sys/fs/resctrl
* /sys/fs/resctrl/
* |-- info
* | |-- L3
* | | |-- cbm_mask
* | | |-- min_cbm_bits
* | | |-- num_closids
* | |-- L3_MON
* | | |-- max_threshold_occupancy
* | | |-- mon_features
* | | |-- num_rmids
* | |-- MB
* | |-- bandwidth_gran
* | |-- delay_linear
* | |-- min_bandwidth
* | |-- num_closids
* |-- ...
* |-- schemata
* |-- tasks
* |-- <clos>
* |-- ...
* |-- schemata
* |-- tasks
*
* For runc, we can make use of `tasks` and `schemata` configuration for L3
* cache and memory bandwidth resource constraints.
*
* The file `tasks` has a list of tasks that belong to this group (e.g.,
* the "<container_id>" group). Tasks can be added to a group by writing the task ID
* to the "tasks" file (which will automatically remove them from the previous
* group to which they belonged). New tasks created by fork(2) and clone(2) are
* added to the same group as their parent.
*
* The file `schemata` has a list of all the resources available to this group.
* Each resource (L3 cache, memory bandwidth) has its own line and format.
*
* L3 cache schema:
* It has allocation bitmasks/values for L3 cache on each socket, which
* contains L3 cache id and capacity bitmask (CBM).
* Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
* For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0"
* which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
*
* A valid L3 cache CBM is a *contiguous* set of bits, and the number of bits
* that can be set is limited by the maximum CBM length, which varies among
* supported Intel CPU models. The kernel checks validity when writing.
* E.g., the default value 0xfffff in root indicates the maximum CBM is 20
* bits, which maps to the entire L3 cache capacity. Some valid CBM values to
* set in a group: 0xf, 0xf0, 0x3ff, 0x1f00, etc.
*
* Memory bandwidth schema:
* It has allocation values for memory bandwidth on each socket, which contains
* L3 cache id and memory bandwidth.
* Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
* For example, on a two-socket machine, the schema line could be "MB:0=20;1=70"
*
* The minimum bandwidth percentage value for each CPU model is predefined and
* can be looked up through "info/MB/min_bandwidth". The bandwidth granularity
* that is allocated is also dependent on the CPU model and can be looked up at
* "info/MB/bandwidth_gran". The available bandwidth control steps are:
* min_bw + N * bw_gran. Intermediate values are rounded to the next control
* step available on the hardware.
*
* If MBA Software Controller is enabled through mount option "-o mba_MBps":
* mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl
* We could specify memory bandwidth in "MBps" (megabytes per second) units
* instead of "percentages". The kernel underneath would use a software feedback
* mechanism, or a "Software Controller", which reads the actual bandwidth using
* MBM counters and adjusts the memory bandwidth percentages to ensure:
* "actual memory bandwidth < user specified memory bandwidth".
*
* For example, on a two-socket machine, the schema line could be
* "MB:0=5000;1=7000" which means 5000 MBps memory bandwidth limit on socket 0
* and 7000 MBps memory bandwidth limit on socket 1.
*
* For more information about Intel RDT kernel interface:
* https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
*
* An example for runc:
* Consider a two-socket machine with two L3 caches where the default CBM is
* 0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10%
* with a memory bandwidth granularity of 10%.
*
* Tasks inside the container only have access to the "upper" 7/11 of L3 cache
* on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a
* maximum memory bandwidth of 20% on socket 0 and 70% on socket 1.
*
* "linux": {
* "intelRdt": {
* "l3CacheSchema": "L3:0=7f0;1=1f",
* "memBwSchema": "MB:0=20;1=70"
* }
* }
*/
type Manager struct {
mu sync.Mutex
config *configs.Config
id string
path string
}
// NewManager returns a new instance of Manager, or nil if the Intel RDT
// functionality is not specified in the config, not available in hardware,
// or not enabled in the kernel.
func NewManager(config *configs.Config, id string, path string) *Manager {
if config.IntelRdt == nil {
return nil
}
if _, err := Root(); err != nil {
// Intel RDT is not available.
return nil
}
return newManager(config, id, path)
}
// newManager is the same as NewManager, except it does not check if the feature
// is actually available. Used by unit tests that mock intelrdt paths.
func newManager(config *configs.Config, id string, path string) *Manager {
return &Manager{
config: config,
id: id,
path: path,
}
}
const (
intelRdtTasks = "tasks"
)
var (
// The flag to indicate if Intel RDT/CAT is enabled
catEnabled bool
// The flag to indicate if Intel RDT/MBA is enabled
mbaEnabled bool
// For Intel RDT initialization
initOnce sync.Once
errNotFound = errors.New("Intel RDT not available")
)
// featuresInit detects which Intel RDT sub-features are enabled.
func featuresInit() {
initOnce.Do(func() {
// 1. Check if Intel RDT "resource control" filesystem is available.
// The user is responsible for mounting the filesystem.
root, err := Root()
if err != nil {
return
}
// 2. Check if Intel RDT sub-features are available in "resource
// control" filesystem. Intel RDT sub-features can be
// selectively disabled or enabled by kernel command line
// (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel
if _, err := os.Stat(filepath.Join(root, "info", "L3")); err == nil {
catEnabled = true
}
if _, err := os.Stat(filepath.Join(root, "info", "MB")); err == nil {
mbaEnabled = true
}
if _, err := os.Stat(filepath.Join(root, "info", "L3_MON")); err != nil {
return
}
enabledMonFeatures, err = getMonFeatures(root)
if err != nil {
return
}
if enabledMonFeatures.mbmTotalBytes || enabledMonFeatures.mbmLocalBytes {
mbmEnabled = true
}
if enabledMonFeatures.llcOccupancy {
cmtEnabled = true
}
})
}
// findIntelRdtMountpointDir returns the mount point of the Intel RDT "resource control" filesystem.
func findIntelRdtMountpointDir() (string, error) {
mi, err := mountinfo.GetMounts(func(m *mountinfo.Info) (bool, bool) {
// similar to mountinfo.FSTypeFilter but stops after the first match
if m.FSType == "resctrl" {
return false, true // don't skip, stop
}
return true, false // skip, keep going
})
if err != nil {
return "", err
}
if len(mi) < 1 {
return "", errNotFound
}
return mi[0].Mountpoint, nil
}
// For Root() use only.
var (
intelRdtRoot string
intelRdtRootErr error
rootOnce sync.Once
)
// The kernel creates this (empty) directory if resctrl is supported by the
// hardware and kernel. The user is responsible for mounting the resctrl
// filesystem, and they could mount it somewhere else if they wanted to.
const defaultResctrlMountpoint = "/sys/fs/resctrl"
// Root returns the Intel RDT "resource control" filesystem mount point.
func Root() (string, error) {
rootOnce.Do(func() {
// Does this system support resctrl?
var statfs unix.Statfs_t
if err := unix.Statfs(defaultResctrlMountpoint, &statfs); err != nil {
if errors.Is(err, unix.ENOENT) {
err = errNotFound
}
intelRdtRootErr = err
return
}
// Has the resctrl fs been mounted to the default mount point?
if statfs.Type == unix.RDTGROUP_SUPER_MAGIC {
intelRdtRoot = defaultResctrlMountpoint
return
}
// The resctrl fs could have been mounted somewhere nonstandard.
intelRdtRoot, intelRdtRootErr = findIntelRdtMountpointDir()
})
return intelRdtRoot, intelRdtRootErr
}
// Gets a single uint64 value from the specified file.
func getIntelRdtParamUint(path, file string) (uint64, error) {
fileName := filepath.Join(path, file)
contents, err := os.ReadFile(fileName)
if err != nil {
return 0, err
}
res, err := fscommon.ParseUint(string(bytes.TrimSpace(contents)), 10, 64)
if err != nil {
return res, fmt.Errorf("unable to parse %q as a uint from file %q", string(contents), fileName)
}
return res, nil
}
// Gets a string value from the specified file
func getIntelRdtParamString(path, file string) (string, error) {
contents, err := os.ReadFile(filepath.Join(path, file))
if err != nil {
return "", err
}
return string(bytes.TrimSpace(contents)), nil
}
func writeFile(dir, file, data string) error {
if dir == "" {
return fmt.Errorf("no such directory for %s", file)
}
if err := os.WriteFile(filepath.Join(dir, file), []byte(data+"\n"), 0o600); err != nil {
return newLastCmdError(fmt.Errorf("intelrdt: unable to write %v: %w", data, err))
}
return nil
}
// Get the read-only L3 cache information
func getL3CacheInfo() (*L3CacheInfo, error) {
l3CacheInfo := &L3CacheInfo{}
rootPath, err := Root()
if err != nil {
return l3CacheInfo, err
}
path := filepath.Join(rootPath, "info", "L3")
cbmMask, err := getIntelRdtParamString(path, "cbm_mask")
if err != nil {
return l3CacheInfo, err
}
minCbmBits, err := getIntelRdtParamUint(path, "min_cbm_bits")
if err != nil {
return l3CacheInfo, err
}
numClosids, err := getIntelRdtParamUint(path, "num_closids")
if err != nil {
return l3CacheInfo, err
}
l3CacheInfo.CbmMask = cbmMask
l3CacheInfo.MinCbmBits = minCbmBits
l3CacheInfo.NumClosids = numClosids
return l3CacheInfo, nil
}
// Get the read-only memory bandwidth information
func getMemBwInfo() (*MemBwInfo, error) {
memBwInfo := &MemBwInfo{}
rootPath, err := Root()
if err != nil {
return memBwInfo, err
}
path := filepath.Join(rootPath, "info", "MB")
bandwidthGran, err := getIntelRdtParamUint(path, "bandwidth_gran")
if err != nil {
return memBwInfo, err
}
delayLinear, err := getIntelRdtParamUint(path, "delay_linear")
if err != nil {
return memBwInfo, err
}
minBandwidth, err := getIntelRdtParamUint(path, "min_bandwidth")
if err != nil {
return memBwInfo, err
}
numClosids, err := getIntelRdtParamUint(path, "num_closids")
if err != nil {
return memBwInfo, err
}
memBwInfo.BandwidthGran = bandwidthGran
memBwInfo.DelayLinear = delayLinear
memBwInfo.MinBandwidth = minBandwidth
memBwInfo.NumClosids = numClosids
return memBwInfo, nil
}
// Get diagnostics for last filesystem operation error from file info/last_cmd_status
func getLastCmdStatus() (string, error) {
rootPath, err := Root()
if err != nil {
return "", err
}
path := filepath.Join(rootPath, "info")
lastCmdStatus, err := getIntelRdtParamString(path, "last_cmd_status")
if err != nil {
return "", err
}
return lastCmdStatus, nil
}
// WriteIntelRdtTasks writes the specified pid into the "tasks" file
func WriteIntelRdtTasks(dir string, pid int) error {
if dir == "" {
return fmt.Errorf("no such directory for %s", intelRdtTasks)
}
// Don't attach any pid if -1 is specified as a pid
if pid != -1 {
if err := os.WriteFile(filepath.Join(dir, intelRdtTasks), []byte(strconv.Itoa(pid)), 0o600); err != nil {
return newLastCmdError(fmt.Errorf("intelrdt: unable to add pid %d: %w", pid, err))
}
}
return nil
}
// Check if Intel RDT/CAT is enabled
func IsCATEnabled() bool {
featuresInit()
return catEnabled
}
// Check if Intel RDT/MBA is enabled
func IsMBAEnabled() bool {
featuresInit()
return mbaEnabled
}
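// A minimal caller-side sketch (not from the original source) of how these
// feature checks are typically consumed; it assumes the administrator has
// already mounted the resctrl filesystem:
//
//	if intelrdt.IsCATEnabled() || intelrdt.IsMBAEnabled() {
//		root, _ := intelrdt.Root() // cached and already validated by featuresInit
//		_ = root                   // usually /sys/fs/resctrl
//	}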
// Get the path of the clos group in "resource control" filesystem that the container belongs to
func (m *Manager) getIntelRdtPath() (string, error) {
rootPath, err := Root()
if err != nil {
return "", err
}
clos := m.id
if m.config.IntelRdt != nil && m.config.IntelRdt.ClosID != "" {
clos = m.config.IntelRdt.ClosID
}
return filepath.Join(rootPath, clos), nil
}
// Applies Intel RDT configuration to the process with the specified pid
func (m *Manager) Apply(pid int) (err error) {
// If intelRdt is not specified in config, we do nothing
if m.config.IntelRdt == nil {
return nil
}
path, err := m.getIntelRdtPath()
if err != nil {
return err
}
m.mu.Lock()
defer m.mu.Unlock()
if m.config.IntelRdt.ClosID != "" && m.config.IntelRdt.L3CacheSchema == "" && m.config.IntelRdt.MemBwSchema == "" {
// Check that the CLOS exists, i.e. it has been pre-configured to
// conform with the runtime spec
if _, err := os.Stat(path); err != nil {
return fmt.Errorf("clos dir not accessible (must be pre-created when l3CacheSchema and memBwSchema are empty): %w", err)
}
}
if err := os.MkdirAll(path, 0o755); err != nil {
return newLastCmdError(err)
}
if err := WriteIntelRdtTasks(path, pid); err != nil {
return newLastCmdError(err)
}
m.path = path
return nil
}
// Destroys the Intel RDT container-specific 'container_id' group
func (m *Manager) Destroy() error {
// Don't remove resctrl group if closid has been explicitly specified. The
// group is likely externally managed, i.e. by some other entity than us.
// There are probably other containers/tasks sharing the same group.
if m.config.IntelRdt != nil && m.config.IntelRdt.ClosID == "" {
m.mu.Lock()
defer m.mu.Unlock()
if err := os.RemoveAll(m.GetPath()); err != nil {
return err
}
m.path = ""
}
return nil
}
// Returns Intel RDT path to save in a state file and to be able to
// restore the object later
func (m *Manager) GetPath() string {
if m.path == "" {
m.path, _ = m.getIntelRdtPath()
}
return m.path
}
// Returns statistics for Intel RDT
func (m *Manager) GetStats() (*Stats, error) {
// If intelRdt is not specified in config
if m.config.IntelRdt == nil {
return nil, nil
}
m.mu.Lock()
defer m.mu.Unlock()
stats := newStats()
rootPath, err := Root()
if err != nil {
return nil, err
}
// The read-only L3 cache and memory bandwidth schemata in root
tmpRootStrings, err := getIntelRdtParamString(rootPath, "schemata")
if err != nil {
return nil, err
}
schemaRootStrings := strings.Split(tmpRootStrings, "\n")
// The L3 cache and memory bandwidth schemata in container's clos group
containerPath := m.GetPath()
tmpStrings, err := getIntelRdtParamString(containerPath, "schemata")
if err != nil {
return nil, err
}
schemaStrings := strings.Split(tmpStrings, "\n")
if IsCATEnabled() {
// The read-only L3 cache information
l3CacheInfo, err := getL3CacheInfo()
if err != nil {
return nil, err
}
stats.L3CacheInfo = l3CacheInfo
// The read-only L3 cache schema in root
for _, schemaRoot := range schemaRootStrings {
if strings.Contains(schemaRoot, "L3") {
stats.L3CacheSchemaRoot = strings.TrimSpace(schemaRoot)
}
}
// The L3 cache schema in container's clos group
for _, schema := range schemaStrings {
if strings.Contains(schema, "L3") {
stats.L3CacheSchema = strings.TrimSpace(schema)
}
}
}
if IsMBAEnabled() {
// The read-only memory bandwidth information
memBwInfo, err := getMemBwInfo()
if err != nil {
return nil, err
}
stats.MemBwInfo = memBwInfo
// The read-only memory bandwidth schema in root
for _, schemaRoot := range schemaRootStrings {
if strings.Contains(schemaRoot, "MB") {
stats.MemBwSchemaRoot = strings.TrimSpace(schemaRoot)
}
}
// The memory bandwidth schema in container's clos group
for _, schema := range schemaStrings {
if strings.Contains(schema, "MB") {
stats.MemBwSchema = strings.TrimSpace(schema)
}
}
}
if IsMBMEnabled() || IsCMTEnabled() {
err = getMonitoringStats(containerPath, stats)
if err != nil {
return nil, err
}
}
return stats, nil
}
// Set Intel RDT "resource control" filesystem as configured.
func (m *Manager) Set(container *configs.Config) error {
// About L3 cache schema:
// It has allocation bitmasks/values for L3 cache on each socket,
// which contains L3 cache id and capacity bitmask (CBM).
// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
// For example, on a two-socket machine, the schema line could be:
// L3:0=ff;1=c0
// which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM
// is 0xc0.
//
// A valid L3 cache CBM is a *contiguous set of bits*, and the number of
// bits that can be set is bounded by the maximum CBM length. The maximum
// CBM length varies among supported Intel CPU models; the kernel checks
// validity on write. E.g., the default value 0xfffff in root indicates
// that the CBM is 20 bits wide, which maps to the entire L3 cache
// capacity. Some valid CBM values to set in a group:
// 0xf, 0xf0, 0x3ff, 0x1f00, etc.
//
//
// About memory bandwidth schema:
// It has allocation values for memory bandwidth on each socket, which
// contains L3 cache id and memory bandwidth.
// Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
// For example, on a two-socket machine, the schema line could be:
// "MB:0=20;1=70"
//
// The minimum bandwidth percentage value for each CPU model is
// predefined and can be looked up through "info/MB/min_bandwidth".
// The bandwidth granularity that is allocated is also dependent on
// the CPU model and can be looked up at "info/MB/bandwidth_gran".
// The available bandwidth control steps are: min_bw + N * bw_gran.
// Intermediate values are rounded to the next control step available
// on the hardware.
//
// If the MBA Software Controller is enabled through the mount option
// "-o mba_MBps" (mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl),
// memory bandwidth can be specified in MBps (megabytes per second)
// instead of percentages. The kernel then uses a software feedback
// mechanism, or "Software Controller", which reads the actual bandwidth
// using MBM counters and adjusts the memory bandwidth percentages to
// ensure:
// "actual memory bandwidth < user specified memory bandwidth".
//
// For example, on a two-socket machine, the schema line could be
// "MB:0=5000;1=7000" which means 5000 MBps memory bandwidth limit on
// socket 0 and 7000 MBps memory bandwidth limit on socket 1.
if container.IntelRdt != nil {
path := m.GetPath()
l3CacheSchema := container.IntelRdt.L3CacheSchema
memBwSchema := container.IntelRdt.MemBwSchema
// TODO: verify that l3CacheSchema and/or memBwSchema match the
// existing schemata if ClosID has been specified. This is more
// involved than reading the file and doing a plain string comparison, as
// the value written in does not necessarily match what gets read out
// (leading zeros, cache id ordering etc).
// Write a single joint schema string to schemata file
if l3CacheSchema != "" && memBwSchema != "" {
if err := writeFile(path, "schemata", l3CacheSchema+"\n"+memBwSchema); err != nil {
return err
}
}
// Write only L3 cache schema string to schemata file
if l3CacheSchema != "" && memBwSchema == "" {
if err := writeFile(path, "schemata", l3CacheSchema); err != nil {
return err
}
}
// Write only memory bandwidth schema string to schemata file
if l3CacheSchema == "" && memBwSchema != "" {
if err := writeFile(path, "schemata", memBwSchema); err != nil {
return err
}
}
}
return nil
}
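// An illustrative sketch (not from the original source) of the configuration
// that drives the schemata writes above, reusing the example values from the
// comment (L3 CBMs 0xff/0xc0 and 20%/70% memory bandwidth on a two-socket
// machine). It assumes resctrl is mounted; the container id "sandbox" and the
// pid variable initPid are hypothetical:
//
//	cfg := &configs.Config{
//		IntelRdt: &configs.IntelRdt{
//			L3CacheSchema: "L3:0=ff;1=c0",
//			MemBwSchema:   "MB:0=20;1=70",
//		},
//	}
//	m := newManager(cfg, "sandbox", "")
//	if err := m.Apply(initPid); err == nil { // creates the clos group and adds initPid
//		_ = m.Set(cfg) // writes both lines into the group's "schemata" file
//	}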
func newLastCmdError(err error) error {
status, err1 := getLastCmdStatus()
if err1 == nil {
return fmt.Errorf("%w, last_cmd_status: %s", err, status)
}
return err
}
// Copyright 2021 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package intelrdt
import (
"strings"
"testing"
)
func init() {
testing.Init()
}
func FuzzParseMonFeatures(data []byte) int {
_, _ = parseMonFeatures(
strings.NewReader(string(data)))
return 1
}
func FuzzSetCacheScema(data []byte) int {
t := &testing.T{}
if (len(data) % 2) != 0 {
return -1
}
halfLen := len(data) / 2
firstHalf := data[:halfLen]
secondHalf := data[halfLen:]
helper := NewIntelRdtTestUtil(t)
l3CacheSchemaBefore := string(firstHalf)
l3CacheSchemeAfter := string(secondHalf)
helper.writeFileContents(map[string]string{
"schemata": l3CacheSchemaBefore + "\n",
})
helper.config.IntelRdt.L3CacheSchema = l3CacheSchemeAfter
intelrdt := NewManager(helper.config, "", helper.IntelRdtPath)
intelrdt.Set(helper.config)
return 1
}
package intelrdt
// The flag to indicate if Intel RDT/MBM is enabled
var mbmEnabled bool
// Check if Intel RDT/MBM is enabled.
func IsMBMEnabled() bool {
featuresInit()
return mbmEnabled
}
func getMBMNumaNodeStats(numaPath string) (*MBMNumaNodeStats, error) {
stats := &MBMNumaNodeStats{}
if enabledMonFeatures.mbmTotalBytes {
mbmTotalBytes, err := getIntelRdtParamUint(numaPath, "mbm_total_bytes")
if err != nil {
return nil, err
}
stats.MBMTotalBytes = mbmTotalBytes
}
if enabledMonFeatures.mbmLocalBytes {
mbmLocalBytes, err := getIntelRdtParamUint(numaPath, "mbm_local_bytes")
if err != nil {
return nil, err
}
stats.MBMLocalBytes = mbmLocalBytes
}
return stats, nil
}
package intelrdt
import (
"bufio"
"io"
"os"
"path/filepath"
"github.com/sirupsen/logrus"
)
var enabledMonFeatures monFeatures
type monFeatures struct {
mbmTotalBytes bool
mbmLocalBytes bool
llcOccupancy bool
}
func getMonFeatures(intelRdtRoot string) (monFeatures, error) {
file, err := os.Open(filepath.Join(intelRdtRoot, "info", "L3_MON", "mon_features"))
if err != nil {
return monFeatures{}, err
}
defer file.Close()
return parseMonFeatures(file)
}
func parseMonFeatures(reader io.Reader) (monFeatures, error) {
scanner := bufio.NewScanner(reader)
monFeatures := monFeatures{}
for scanner.Scan() {
switch feature := scanner.Text(); feature {
case "mbm_total_bytes":
monFeatures.mbmTotalBytes = true
case "mbm_local_bytes":
monFeatures.mbmLocalBytes = true
case "llc_occupancy":
monFeatures.llcOccupancy = true
default:
logrus.Warnf("Unsupported Intel RDT monitoring feature: %s", feature)
}
}
return monFeatures, scanner.Err()
}
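// For reference, info/L3_MON/mon_features lists one feature name per line, so
// a parse looks like the following sketch (not from the original source; a
// strings import is assumed):
//
//	mf, err := parseMonFeatures(strings.NewReader("llc_occupancy\nmbm_total_bytes\n"))
//	// err == nil; mf.llcOccupancy and mf.mbmTotalBytes are true, mf.mbmLocalBytes is false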
func getMonitoringStats(containerPath string, stats *Stats) error {
numaFiles, err := os.ReadDir(filepath.Join(containerPath, "mon_data"))
if err != nil {
return err
}
var mbmStats []MBMNumaNodeStats
var cmtStats []CMTNumaNodeStats
for _, file := range numaFiles {
if file.IsDir() {
numaPath := filepath.Join(containerPath, "mon_data", file.Name())
if IsMBMEnabled() {
numaMBMStats, err := getMBMNumaNodeStats(numaPath)
if err != nil {
return err
}
mbmStats = append(mbmStats, *numaMBMStats)
}
if IsCMTEnabled() {
numaCMTStats, err := getCMTNumaNodeStats(numaPath)
if err != nil {
return err
}
cmtStats = append(cmtStats, *numaCMTStats)
}
}
}
stats.MBMStats = &mbmStats
stats.CMTStats = &cmtStats
return err
}
package intelrdt
type L3CacheInfo struct {
CbmMask string `json:"cbm_mask,omitempty"`
MinCbmBits uint64 `json:"min_cbm_bits,omitempty"`
NumClosids uint64 `json:"num_closids,omitempty"`
}
type MemBwInfo struct {
BandwidthGran uint64 `json:"bandwidth_gran,omitempty"`
DelayLinear uint64 `json:"delay_linear,omitempty"`
MinBandwidth uint64 `json:"min_bandwidth,omitempty"`
NumClosids uint64 `json:"num_closids,omitempty"`
}
type MBMNumaNodeStats struct {
// The 'mbm_total_bytes' in 'container_id' group.
MBMTotalBytes uint64 `json:"mbm_total_bytes"`
// The 'mbm_local_bytes' in 'container_id' group.
MBMLocalBytes uint64 `json:"mbm_local_bytes"`
}
type CMTNumaNodeStats struct {
// The 'llc_occupancy' in 'container_id' group.
LLCOccupancy uint64 `json:"llc_occupancy"`
}
type Stats struct {
// The read-only L3 cache information
L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"`
// The read-only L3 cache schema in root
L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"`
// The L3 cache schema in 'container_id' group
L3CacheSchema string `json:"l3_cache_schema,omitempty"`
// The read-only memory bandwidth information
MemBwInfo *MemBwInfo `json:"mem_bw_info,omitempty"`
// The read-only memory bandwidth schema in root
MemBwSchemaRoot string `json:"mem_bw_schema_root,omitempty"`
// The memory bandwidth schema in 'container_id' group
MemBwSchema string `json:"mem_bw_schema,omitempty"`
// The memory bandwidth monitoring statistics from NUMA nodes in 'container_id' group
MBMStats *[]MBMNumaNodeStats `json:"mbm_stats,omitempty"`
// The cache monitoring technology statistics from NUMA nodes in 'container_id' group
CMTStats *[]CMTNumaNodeStats `json:"cmt_stats,omitempty"`
}
func newStats() *Stats {
return &Stats{}
}
/*
* Utility for testing Intel RDT operations.
* Creates a mock of the Intel RDT "resource control" filesystem for the duration of the test.
*/
package intelrdt
import (
"os"
"path/filepath"
"testing"
"github.com/opencontainers/runc/libcontainer/configs"
)
type intelRdtTestUtil struct {
config *configs.Config
// Path to the mock Intel RDT "resource control" filesystem directory
IntelRdtPath string
t *testing.T
}
// Creates a new test util
func NewIntelRdtTestUtil(t *testing.T) *intelRdtTestUtil {
config := &configs.Config{
IntelRdt: &configs.IntelRdt{},
}
// Assign a fake intelRdtRoot value, returned by Root().
intelRdtRoot = t.TempDir()
// Make sure Root() won't even try to parse mountinfo.
rootOnce.Do(func() {})
testIntelRdtPath := filepath.Join(intelRdtRoot, "resctrl")
// Ensure the full mock Intel RDT "resource control" filesystem path exists
if err := os.MkdirAll(testIntelRdtPath, 0o755); err != nil {
t.Fatal(err)
}
return &intelRdtTestUtil{config: config, IntelRdtPath: testIntelRdtPath, t: t}
}
// Writes the specified contents to the mock Intel RDT "resource control" files
func (c *intelRdtTestUtil) writeFileContents(fileContents map[string]string) {
for file, contents := range fileContents {
err := writeFile(c.IntelRdtPath, file, contents)
if err != nil {
c.t.Fatal(err)
}
}
}
//go:build linux
package userns
import (
"bufio"
"bytes"
"fmt"
"io"
"os"
"unsafe"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
)
/*
#include <stdlib.h>
extern int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd);
*/
import "C"
func parseIdmapData(data []byte) (ms []configs.IDMap, err error) {
scanner := bufio.NewScanner(bytes.NewReader(data))
for scanner.Scan() {
var m configs.IDMap
line := scanner.Text()
if _, err := fmt.Sscanf(line, "%d %d %d", &m.ContainerID, &m.HostID, &m.Size); err != nil {
return nil, fmt.Errorf("parsing id map failed: invalid format in line %q: %w", line, err)
}
ms = append(ms, m)
}
if err := scanner.Err(); err != nil {
return nil, fmt.Errorf("parsing id map failed: %w", err)
}
return ms, nil
}
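// The input is the kernel's /proc/<pid>/{uid,gid}_map format: one
// "<container-id> <host-id> <size>" triple per line. A sketch (not from the
// original source):
//
//	ms, err := parseIdmapData([]byte("0 1000 1\n1 100000 65536\n"))
//	// err == nil
//	// ms[0] == configs.IDMap{ContainerID: 0, HostID: 1000, Size: 1}
//	// ms[1] == configs.IDMap{ContainerID: 1, HostID: 100000, Size: 65536}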
// Do something equivalent to nsenter --user=<nsPath> cat <path>, but more
// efficiently. Returns the contents of the requested file from within the user
// namespace.
func spawnUserNamespaceCat(nsPath string, path string) ([]byte, error) {
rdr, wtr, err := os.Pipe()
if err != nil {
return nil, fmt.Errorf("create pipe for userns spawn failed: %w", err)
}
defer rdr.Close()
defer wtr.Close()
errRdr, errWtr, err := os.Pipe()
if err != nil {
return nil, fmt.Errorf("create error pipe for userns spawn failed: %w", err)
}
defer errRdr.Close()
defer errWtr.Close()
cNsPath := C.CString(nsPath)
defer C.free(unsafe.Pointer(cNsPath))
cPath := C.CString(path)
defer C.free(unsafe.Pointer(cPath))
childPid := C.spawn_userns_cat(cNsPath, cPath, C.int(wtr.Fd()), C.int(errWtr.Fd()))
if childPid < 0 {
return nil, fmt.Errorf("failed to spawn fork for userns")
} else if childPid == 0 {
// this should never happen
panic("runc executing inside fork child -- unsafe state!")
}
// We are in the parent -- close the write end of the pipe before reading.
wtr.Close()
output, err := io.ReadAll(rdr)
rdr.Close()
if err != nil {
return nil, fmt.Errorf("reading from userns spawn failed: %w", err)
}
// Ditto for the error pipe.
errWtr.Close()
errOutput, err := io.ReadAll(errRdr)
errRdr.Close()
if err != nil {
return nil, fmt.Errorf("reading from userns spawn error pipe failed: %w", err)
}
errOutput = bytes.TrimSpace(errOutput)
// Clean up the child.
child, err := os.FindProcess(int(childPid))
if err != nil {
return nil, fmt.Errorf("could not find userns spawn process: %w", err)
}
state, err := child.Wait()
if err != nil {
return nil, fmt.Errorf("failed to wait for userns spawn process: %w", err)
}
if !state.Success() {
errStr := string(errOutput)
if errStr == "" {
errStr = fmt.Sprintf("unknown error (status code %d)", state.ExitCode())
}
return nil, fmt.Errorf("userns spawn: %s", errStr)
} else if len(errOutput) > 0 {
// We can just ignore weird output in the error pipe if the process
// didn't bail(), but log it for debugging for the sake of completeness.
logrus.Debugf("userns spawn succeeded but unexpected error message found: %s", string(errOutput))
}
// The subprocess succeeded, return whatever it wrote to the pipe.
return output, nil
}
func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, err error) {
var (
pid int
extra rune
tryFastPath bool
)
// nsPath is usually of the form /proc/<pid>/ns/user, which means that we
// already have a pid that is part of the user namespace and thus we can
// just use the pid to read from /proc/<pid>/*id_map.
//
// Note that Sscanf doesn't consume the whole input, so we check for any
// trailing data with %c. That way, we can be sure the pattern matched
// /proc/$pid/ns/user _exactly_ iff n == 1.
if n, _ := fmt.Sscanf(nsPath, "/proc/%d/ns/user%c", &pid, &extra); n == 1 {
tryFastPath = pid > 0
}
for _, mapType := range []struct {
name string
idMap *[]configs.IDMap
}{
{"uid_map", &uidMap},
{"gid_map", &gidMap},
} {
var mapData []byte
if tryFastPath {
path := fmt.Sprintf("/proc/%d/%s", pid, mapType.name)
data, err := os.ReadFile(path)
if err != nil {
// Do not error out here -- we need to try the slow path if the
// fast path failed.
logrus.Debugf("failed to use fast path to read %s from userns %s (error: %s), falling back to slow userns-join path", mapType.name, nsPath, err)
} else {
mapData = data
}
} else {
logrus.Debugf("cannot use fast path to read %s from userns %s, falling back to slow userns-join path", mapType.name, nsPath)
}
if mapData == nil {
// We have to actually join the namespace if we cannot take the
// fast path. The path is resolved with respect to the child
// process, so just use /proc/self.
data, err := spawnUserNamespaceCat(nsPath, "/proc/self/"+mapType.name)
if err != nil {
return nil, nil, err
}
mapData = data
}
idMap, err := parseIdmapData(mapData)
if err != nil {
return nil, nil, fmt.Errorf("failed to parse %s of userns %s: %w", mapType.name, nsPath, err)
}
*mapType.idMap = idMap
}
return uidMap, gidMap, nil
}
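// Caller-side sketch (not from the original source; the pid 1234 is
// hypothetical):
//
//	uids, gids, err := userns.GetUserNamespaceMappings("/proc/1234/ns/user")
//	if err == nil && userns.IsSameMapping(uids, gids) {
//		// uid and gid mappings are identical
//	}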
// IsSameMapping returns whether or not the two id mappings are the same. Note
// that if the order of the mappings is different, or a mapping has been split,
// the mappings will be considered different.
func IsSameMapping(a, b []configs.IDMap) bool {
if len(a) != len(b) {
return false
}
for idx := range a {
if a[idx] != b[idx] {
return false
}
}
return true
}
package userns
import (
"fmt"
"os"
"sort"
"strings"
"sync"
"syscall"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/configs"
)
type Mapping struct {
UIDMappings []configs.IDMap
GIDMappings []configs.IDMap
}
func (m Mapping) toSys() (uids, gids []syscall.SysProcIDMap) {
for _, uid := range m.UIDMappings {
uids = append(uids, syscall.SysProcIDMap{
ContainerID: int(uid.ContainerID),
HostID: int(uid.HostID),
Size: int(uid.Size),
})
}
for _, gid := range m.GIDMappings {
gids = append(gids, syscall.SysProcIDMap{
ContainerID: int(gid.ContainerID),
HostID: int(gid.HostID),
Size: int(gid.Size),
})
}
return
}
// id returns a unique identifier for this mapping, agnostic of the order of
// the uid and gid mappings (because the order doesn't matter to the kernel).
// The set of userns handles is indexed using this ID.
func (m Mapping) id() string {
var uids, gids []string
for _, idmap := range m.UIDMappings {
uids = append(uids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size))
}
for _, idmap := range m.GIDMappings {
gids = append(gids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size))
}
// We don't care about the sort order -- just sort them.
sort.Strings(uids)
sort.Strings(gids)
return "uid=" + strings.Join(uids, ",") + ";gid=" + strings.Join(gids, ",")
}
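// For example (sketch, not from the original source), a single 65536-wide
// mapping starting at host id 100000 for both uids and gids yields:
//
//	m := Mapping{
//		UIDMappings: []configs.IDMap{{ContainerID: 0, HostID: 100000, Size: 65536}},
//		GIDMappings: []configs.IDMap{{ContainerID: 0, HostID: 100000, Size: 65536}},
//	}
//	// m.id() == "uid=0:100000:65536;gid=0:100000:65536"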
type Handles struct {
m sync.Mutex
maps map[string]*os.File
}
// Release all resources associated with this Handle. All existing files
// returned from Get() will continue to work even after calling Release(). The
// same Handles can be re-used after calling Release().
func (hs *Handles) Release() {
hs.m.Lock()
defer hs.m.Unlock()
// Close the files for good measure, though GC will do that for us anyway.
for _, file := range hs.maps {
_ = file.Close()
}
hs.maps = nil
}
func spawnProc(req Mapping) (*os.Process, error) {
// We need to spawn a subprocess with the requested mappings, which is
// unfortunately quite expensive. The "safe" way of doing this is natively
// with Go (and then spawning something like "sleep infinity"), but
// execve() is a waste of cycles because we just need some process to have
// the right mapping, we don't care what it's executing. The "unsafe"
// option of doing a clone() behind the back of Go is probably okay in
// theory as long as we just do kill(getpid(), SIGSTOP). However, if we
// tell Go to put the new process into PTRACE_TRACEME mode, we can avoid
// the exec and not have to faff around with the mappings.
//
// Note that Go's stdlib does not support newuidmap, but in the case of
// id-mapped mounts, it seems incredibly unlikely that the user will be
// requesting us to do a remapping as an unprivileged user with mappings
// they have privileges over.
logrus.Debugf("spawning dummy process for id-mapping %s", req.id())
uidMappings, gidMappings := req.toSys()
// We don't need to use /proc/thread-self here because the exe mm of a
// thread-group is guaranteed to be the same for all threads by definition.
// This lets us avoid having to do runtime.LockOSThread.
return os.StartProcess("/proc/self/exe", []string{"runc", "--help"}, &os.ProcAttr{
Sys: &syscall.SysProcAttr{
Cloneflags: unix.CLONE_NEWUSER,
UidMappings: uidMappings,
GidMappings: gidMappings,
GidMappingsEnableSetgroups: false,
// Put the process into PTRACE_TRACEME mode to allow us to get the
// userns without having a proper execve() target.
Ptrace: true,
},
})
}
func dupFile(f *os.File) (*os.File, error) {
newFd, err := unix.FcntlInt(f.Fd(), unix.F_DUPFD_CLOEXEC, 0)
if err != nil {
return nil, os.NewSyscallError("fcntl(F_DUPFD_CLOEXEC)", err)
}
return os.NewFile(uintptr(newFd), f.Name()), nil
}
// Get returns a handle to a /proc/$pid/ns/user nsfs file with the requested
// mapping. The processes spawned to produce userns nsfds are cached, so if
// equivalent user namespace mappings are requested, the same user namespace
// will be returned. The caller is responsible for closing the returned file
// descriptor.
func (hs *Handles) Get(req Mapping) (file *os.File, err error) {
hs.m.Lock()
defer hs.m.Unlock()
if hs.maps == nil {
hs.maps = make(map[string]*os.File)
}
file, ok := hs.maps[req.id()]
if !ok {
proc, err := spawnProc(req)
if err != nil {
return nil, fmt.Errorf("failed to spawn dummy process for map %s: %w", req.id(), err)
}
// Make sure we kill the helper process. We ignore errors because
// there's not much we can do about them anyway.
defer func() {
_ = proc.Kill()
_, _ = proc.Wait()
}()
// Stash away a handle to the userns file. This is neater than keeping
// the process alive, because Go's GC can handle files much better than
// leaked processes, and having long-living useless processes seems
// less than ideal.
file, err = os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid))
if err != nil {
return nil, err
}
hs.maps[req.id()] = file
}
// Duplicate the file, to make sure the lifecycle of each *os.File we
// return is independent.
return dupFile(file)
}
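// Caller-side sketch (not from the original source); it assumes the caller has
// privileges over the requested mapping:
//
//	var hs userns.Handles
//	defer hs.Release()
//	nsFile, err := hs.Get(userns.Mapping{
//		UIDMappings: []configs.IDMap{{ContainerID: 0, HostID: 100000, Size: 65536}},
//		GIDMappings: []configs.IDMap{{ContainerID: 0, HostID: 100000, Size: 65536}},
//	})
//	if err == nil {
//		defer nsFile.Close() // independent dup; safe to close regardless of Release
//		// e.g. pass nsFile.Fd() as Userns_fd for MOUNT_ATTR_IDMAP
//	}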
package keys
import (
"errors"
"fmt"
"strconv"
"strings"
"golang.org/x/sys/unix"
)
type KeySerial uint32
func JoinSessionKeyring(name string) (KeySerial, error) {
sessKeyID, err := unix.KeyctlJoinSessionKeyring(name)
if err != nil {
return 0, fmt.Errorf("unable to create session key: %w", err)
}
return KeySerial(sessKeyID), nil
}
// ModKeyringPerm modifies permissions on a keyring by reading the current
// permissions, ANDing the bits with the given mask (clearing permissions),
// and setting the additional permission bits.
func ModKeyringPerm(ringID KeySerial, mask, setbits uint32) error {
dest, err := unix.KeyctlString(unix.KEYCTL_DESCRIBE, int(ringID))
if err != nil {
return err
}
res := strings.Split(dest, ";")
if len(res) < 5 {
return errors.New("Destination buffer for key description is too small")
}
// parse permissions
perm64, err := strconv.ParseUint(res[3], 16, 32)
if err != nil {
return err
}
perm := (uint32(perm64) & mask) | setbits
return unix.KeyctlSetperm(int(ringID), perm)
}
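// The KEYCTL_DESCRIBE string split above has the form
// "type;uid;gid;perm;description" (e.g. "keyring;0;0;3f3f0000;_ses"), so
// res[3] is the hex permission mask. A usage sketch (not from the original
// source; keepMask and extraBits are caller-chosen values shown only to
// illustrate the perm = (perm & mask) | setbits semantics):
//
//	ring, err := keys.JoinSessionKeyring("_ses")
//	if err != nil {
//		return err
//	}
//	const keepMask, extraBits = 0x3f3f3f3f, 0x0
//	return keys.ModKeyringPerm(ring, keepMask, extraBits)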
//go:build gofuzz
// +build gofuzz
// Copyright 2021 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package libcontainer
import (
"os"
gofuzzheaders "github.com/AdaLogics/go-fuzz-headers"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/sirupsen/logrus"
)
func FuzzStateApi(data []byte) int {
// We do not want any log output:
logrus.SetLevel(logrus.PanicLevel)
if len(data) < 4 {
return -1
}
// Create the root dir:
root, err := newTestRoot()
if err != nil {
return -1
}
defer os.RemoveAll(root)
// Create a fuzzconsumer for later use
c := gofuzzheaders.NewConsumer(data)
// Create a config
config := new(configs.Config)
err = c.GenerateStruct(config)
if err != nil {
return 0
}
config.Rootfs = root
// Add Namespaces:
ns, err := c.GetInt()
if err != nil {
return -1
}
if (ns % 3) == 0 {
config.Namespaces = configs.Namespaces(
[]configs.Namespace{
{Type: configs.NEWUTS},
},
)
} else if (ns % 4) == 0 {
config.Namespaces = configs.Namespaces(
[]configs.Namespace{
{Type: configs.NEWNS},
},
)
} else {
config.Namespaces = []configs.Namespace{}
}
container := newContainerWithName()
if container == nil {
return 0
}
defer container.Destroy()
err = container.Set(*config)
if err != nil {
return 0
}
// Fuzz container APIs:
_, _ = container.State()
_, _ = container.Stats()
_, _ = container.OCIState()
_, _ = container.Processes()
process := &Process{}
err = c.GenerateStruct(process)
if err != nil {
return 0
}
_ = container.Run(process)
err = container.Pause()
if err == nil {
container.Resume()
}
process = &Process{}
err = c.GenerateStruct(process)
if err != nil {
return 0
}
criuOpts := &CriuOpts{}
err = c.GenerateStruct(criuOpts)
if err != nil {
return 0
}
_ = container.Restore(process, criuOpts)
return 1
}
func newContainerWithName() *Container {
pid := 1
stat, err := system.Stat(pid)
if err != nil {
return nil
}
container := &Container{
id: "myid",
config: &configs.Config{},
cgroupManager: &mockCgroupManager{
allPids: []int{1, 2, 3},
paths: map[string]string{
"device": "/proc/self/cgroups",
},
},
initProcess: &mockProcess{
_pid: 1,
started: 10,
},
initProcessStartTime: stat.StartTime,
}
container.state = &runningState{c: container}
return container
}
func newTestRoot() (string, error) {
dir := "/tmp/fuzzing"
if err := os.MkdirAll(dir, 0o700); err != nil {
return "", err
}
return dir, nil
}
package logs
import (
"bufio"
"encoding/json"
"io"
"github.com/sirupsen/logrus"
)
func ForwardLogs(logPipe io.ReadCloser) chan error {
done := make(chan error, 1)
s := bufio.NewScanner(logPipe)
logger := logrus.StandardLogger()
if logger.ReportCaller {
// Need a copy of the standard logger, but with ReportCaller
// turned off, as the logs are merely forwarded and their
// true source is not this file/line/function.
logNoCaller := *logrus.StandardLogger()
logNoCaller.ReportCaller = false
logger = &logNoCaller
}
go func() {
for s.Scan() {
processEntry(s.Bytes(), logger)
}
if err := logPipe.Close(); err != nil {
logrus.Errorf("error closing log source: %v", err)
}
// The only error we want to return is when reading from
// logPipe has failed.
done <- s.Err()
close(done)
}()
return done
}
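// Typical parent-side wiring, as a sketch (not from the original source);
// logPipeParent stands for the read end of the child's log pipe:
//
//	done := logs.ForwardLogs(logPipeParent)
//	// ... wait for the child to exit ...
//	if err := <-done; err != nil {
//		logrus.Warnf("log forwarding failed: %v", err)
//	}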
func processEntry(text []byte, logger *logrus.Logger) {
if len(text) == 0 {
return
}
var jl struct {
Level logrus.Level `json:"level"`
Msg string `json:"msg"`
}
if err := json.Unmarshal(text, &jl); err != nil {
logrus.Errorf("failed to decode %q to json: %v", text, err)
return
}
logger.Log(jl.Level, jl.Msg)
}
package libcontainer
import (
"fmt"
"math"
"github.com/vishvananda/netlink/nl"
"golang.org/x/sys/unix"
)
// List of known message types we want to send to the bootstrap program.
// The numbers are randomly chosen so as not to conflict with known netlink types.
const (
InitMsg uint16 = 62000
CloneFlagsAttr uint16 = 27281
NsPathsAttr uint16 = 27282
UidmapAttr uint16 = 27283
GidmapAttr uint16 = 27284
SetgroupAttr uint16 = 27285
OomScoreAdjAttr uint16 = 27286
RootlessEUIDAttr uint16 = 27287
UidmapPathAttr uint16 = 27288
GidmapPathAttr uint16 = 27289
TimeOffsetsAttr uint16 = 27290
)
type Int32msg struct {
Type uint16
Value uint32
}
// Serialize serializes the message.
// Int32msg has the following representation
// | nlattr len | nlattr type |
// | uint32 value |
func (msg *Int32msg) Serialize() []byte {
buf := make([]byte, msg.Len())
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(msg.Len()))
native.PutUint16(buf[2:4], msg.Type)
native.PutUint32(buf[4:8], msg.Value)
return buf
}
func (msg *Int32msg) Len() int {
return unix.NLA_HDRLEN + 4
}
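// Wire-format sketch (not from the original source): each message is encoded
// as a netlink attribute, i.e. a 4-byte header (uint16 total length, uint16
// type) followed by the native-endian payload.
//
//	msg := Int32msg{Type: OomScoreAdjAttr, Value: 1000}
//	buf := msg.Serialize()
//	// len(buf) == 8: [len=8][type=27286][value=1000], all in native byte order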
// Bytemsg has the following representation
// | nlattr len | nlattr type |
// | value | pad |
type Bytemsg struct {
Type uint16
Value []byte
}
func (msg *Bytemsg) Serialize() []byte {
l := msg.Len()
if l > math.MaxUint16 {
// We cannot return nil nor an error here, so we panic with
// a specific type instead, which is handled via recover in
// bootstrapData.
panic(netlinkError{fmt.Errorf("netlink: cannot serialize bytemsg of length %d (larger than UINT16_MAX)", l)})
}
buf := make([]byte, (l+unix.NLA_ALIGNTO-1) & ^(unix.NLA_ALIGNTO-1))
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(l))
native.PutUint16(buf[2:4], msg.Type)
copy(buf[4:], msg.Value)
return buf
}
func (msg *Bytemsg) Len() int {
return unix.NLA_HDRLEN + len(msg.Value) + 1 // null-terminated
}
type Boolmsg struct {
Type uint16
Value bool
}
func (msg *Boolmsg) Serialize() []byte {
buf := make([]byte, msg.Len())
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(msg.Len()))
native.PutUint16(buf[2:4], msg.Type)
if msg.Value {
native.PutUint32(buf[4:8], uint32(1))
} else {
native.PutUint32(buf[4:8], uint32(0))
}
return buf
}
func (msg *Boolmsg) Len() int {
return unix.NLA_HDRLEN + 4 // alignment
}
package libcontainer
import (
"errors"
"fmt"
"io/fs"
"os"
"strconv"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/internal/userns"
"github.com/opencontainers/runc/libcontainer/utils"
)
// mountSourceType indicates what type of file descriptor is being returned. It
// is used to tell rootfs_linux.go whether or not to use move_mount(2) to
// install the mount.
type mountSourceType string
const (
// An open_tree(2)-style file descriptor that needs to be installed using
// move_mount(2).
mountSourceOpenTree mountSourceType = "open_tree"
// A plain file descriptor that can be mounted through /proc/thread-self/fd.
mountSourcePlain mountSourceType = "plain-open"
)
type mountSource struct {
Type mountSourceType `json:"type"`
file *os.File `json:"-"`
}
// mountError holds an error from a failed mount or unmount operation.
type mountError struct {
op string
source string
srcFile *mountSource
target string
dstFd string
flags uintptr
data string
err error
}
// Error provides a string error representation.
func (e *mountError) Error() string {
out := e.op + " "
if e.source != "" {
out += "src=" + e.source + ", "
if e.srcFile != nil {
out += "srcType=" + string(e.srcFile.Type) + ", "
out += "srcFd=" + strconv.Itoa(int(e.srcFile.file.Fd())) + ", "
}
}
out += "dst=" + e.target
if e.dstFd != "" {
out += ", dstFd=" + e.dstFd
}
if e.flags != uintptr(0) {
out += ", flags=0x" + strconv.FormatUint(uint64(e.flags), 16)
}
if e.data != "" {
out += ", data=" + e.data
}
out += ": " + e.err.Error()
return out
}
// Unwrap returns the underlying error.
// This is a convention used by Go 1.13+ standard library.
func (e *mountError) Unwrap() error {
return e.err
}
// mount is a simple unix.Mount wrapper, returning an error with more context
// in case it failed.
func mount(source, target, fstype string, flags uintptr, data string) error {
return mountViaFds(source, nil, target, "", fstype, flags, data)
}
// mountViaFds is a unix.Mount wrapper which uses srcFile instead of source,
// and dstFd instead of target, unless those are empty.
//
// If srcFile is non-nil and flags does not contain MS_REMOUNT, mountViaFds
// will mount it according to the mountSourceType of the file descriptor.
//
// The dstFd argument, if non-empty, is expected to be in the form of a path to
// an opened file descriptor on procfs (i.e. "/proc/thread-self/fd/NN").
//
// If a file descriptor is used instead of a source or a target path, the
// corresponding path is only used to add context to an error in case the mount
// operation has failed.
func mountViaFds(source string, srcFile *mountSource, target, dstFd, fstype string, flags uintptr, data string) error {
// MS_REMOUNT and srcFile don't make sense together.
if srcFile != nil && flags&unix.MS_REMOUNT != 0 {
logrus.Debugf("mount source passed along with MS_REMOUNT -- ignoring srcFile")
srcFile = nil
}
dst := target
if dstFd != "" {
dst = dstFd
}
src := source
isMoveMount := srcFile != nil && srcFile.Type == mountSourceOpenTree
if srcFile != nil {
// If we're going to use the /proc/thread-self/... path for classic
// mount(2), we need to get a safe handle to /proc/thread-self. This
// isn't needed for move_mount(2) because in that case the path is just
// a dummy string used for error info.
srcFileFd := srcFile.file.Fd()
if isMoveMount {
src = "/proc/self/fd/" + strconv.Itoa(int(srcFileFd))
} else {
var closer utils.ProcThreadSelfCloser
src, closer = utils.ProcThreadSelfFd(srcFileFd)
defer closer()
}
}
var op string
var err error
if isMoveMount {
op = "move_mount"
err = unix.MoveMount(int(srcFile.file.Fd()), "",
unix.AT_FDCWD, dstFd,
unix.MOVE_MOUNT_F_EMPTY_PATH|unix.MOVE_MOUNT_T_SYMLINKS)
} else {
op = "mount"
err = unix.Mount(src, dst, fstype, flags, data)
}
if err != nil {
return &mountError{
op: op,
source: source,
srcFile: srcFile,
target: target,
dstFd: dstFd,
flags: flags,
data: data,
err: err,
}
}
return nil
}
// unmount is a simple unix.Unmount wrapper.
func unmount(target string, flags int) error {
err := unix.Unmount(target, flags)
if err != nil {
return &mountError{
op: "unmount",
target: target,
flags: uintptr(flags),
err: err,
}
}
return nil
}
// syscallMode returns the syscall-specific mode bits from Go's portable mode bits.
// Copy from https://cs.opensource.google/go/go/+/refs/tags/go1.20.7:src/os/file_posix.go;l=61-75
func syscallMode(i fs.FileMode) (o uint32) {
o |= uint32(i.Perm())
if i&fs.ModeSetuid != 0 {
o |= unix.S_ISUID
}
if i&fs.ModeSetgid != 0 {
o |= unix.S_ISGID
}
if i&fs.ModeSticky != 0 {
o |= unix.S_ISVTX
}
// No mapping for Go's ModeTemporary (plan9 only).
return
}
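// A quick sketch of the conversion (not from the original source): Go keeps
// setuid/setgid/sticky in high FileMode bits, while the kernel expects them
// in the traditional octal positions.
//
//	syscallMode(0o755 | fs.ModeSetuid) // == 0o4755
//	syscallMode(0o777 | fs.ModeSticky) // == 0o1777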
// mountFd creates a "mount source fd" (either through open_tree(2) or just
// open(O_PATH)) based on the provided configuration. This function must be
// called from within the container's mount namespace.
//
// In the case of idmapped mount configurations, the returned mount source will
// be an open_tree(2) file with MOUNT_ATTR_IDMAP applied. For other
// bind-mounts, it will be an O_PATH. If the type of mount cannot be handled,
// the returned mountSource will be nil, indicating that the container init
// process will need to do an old-fashioned mount(2) themselves.
//
// This helper is only intended to be used by goCreateMountSources.
func mountFd(nsHandles *userns.Handles, m *configs.Mount) (*mountSource, error) {
if !m.IsBind() {
return nil, errors.New("new mount api: only bind-mounts are supported")
}
if nsHandles == nil {
nsHandles = new(userns.Handles)
defer nsHandles.Release()
}
var mountFile *os.File
var sourceType mountSourceType
// Ideally, we would use OPEN_TREE_CLONE for everything, because we can
// be sure that the file descriptor cannot be used to escape outside of
// the mount root. Unfortunately, OPEN_TREE_CLONE is far more expensive
// than open(2) because it requires doing mounts inside a new anonymous
// mount namespace. So we use open(2) for standard bind-mounts, and
// OPEN_TREE_CLONE when we need to set mount attributes here.
//
// While passing open(2)'d paths from the host rootfs isn't exactly the
// safest thing in the world, the files will not survive across
// execve(2) and "runc init" is non-dumpable so it should not be
// possible for a malicious container process to gain access to the
// file descriptors. We also don't do any of this for "runc exec",
// lessening the risk even further.
if m.IsIDMapped() {
flags := uint(unix.OPEN_TREE_CLONE | unix.OPEN_TREE_CLOEXEC)
if m.Flags&unix.MS_REC == unix.MS_REC {
flags |= unix.AT_RECURSIVE
}
fd, err := unix.OpenTree(unix.AT_FDCWD, m.Source, flags)
if err != nil {
return nil, &os.PathError{Op: "open_tree(OPEN_TREE_CLONE)", Path: m.Source, Err: err}
}
mountFile = os.NewFile(uintptr(fd), m.Source)
sourceType = mountSourceOpenTree
// Configure the id mapping.
var usernsFile *os.File
if m.IDMapping.UserNSPath == "" {
usernsFile, err = nsHandles.Get(userns.Mapping{
UIDMappings: m.IDMapping.UIDMappings,
GIDMappings: m.IDMapping.GIDMappings,
})
if err != nil {
return nil, fmt.Errorf("failed to create userns for %s id-mapping: %w", m.Source, err)
}
} else {
usernsFile, err = os.Open(m.IDMapping.UserNSPath)
if err != nil {
return nil, fmt.Errorf("failed to open existing userns for %s id-mapping: %w", m.Source, err)
}
}
defer usernsFile.Close()
setAttrFlags := uint(unix.AT_EMPTY_PATH)
// If the mount has "ridmap" set, we apply the configuration
// recursively. This allows you to create "rbind" mounts where only
// the top-level mount has an idmapping. I'm not sure why you'd
// want that, but still...
if m.IDMapping.Recursive {
setAttrFlags |= unix.AT_RECURSIVE
}
if err := unix.MountSetattr(int(mountFile.Fd()), "", setAttrFlags, &unix.MountAttr{
Attr_set: unix.MOUNT_ATTR_IDMAP,
Userns_fd: uint64(usernsFile.Fd()),
}); err != nil {
extraMsg := ""
if err == unix.EINVAL {
extraMsg = " (maybe the filesystem used doesn't support idmap mounts on this kernel?)"
}
return nil, fmt.Errorf("failed to set MOUNT_ATTR_IDMAP on %s: %w%s", m.Source, err, extraMsg)
}
} else {
var err error
mountFile, err = os.OpenFile(m.Source, unix.O_PATH|unix.O_CLOEXEC, 0)
if err != nil {
return nil, err
}
sourceType = mountSourcePlain
}
return &mountSource{
Type: sourceType,
file: mountFile,
}, nil
}
package libcontainer
import (
"bytes"
"fmt"
"os"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/types"
"github.com/vishvananda/netlink"
)
var strategies = map[string]networkStrategy{
"loopback": &loopback{},
}
// networkStrategy represents a specific network configuration for
// a container's networking stack
type networkStrategy interface {
create(*network, int) error
initialize(*network) error
detach(*configs.Network) error
attach(*configs.Network) error
}
// getStrategy returns the specific network strategy for the
// provided type.
func getStrategy(tpe string) (networkStrategy, error) {
s, exists := strategies[tpe]
if !exists {
return nil, fmt.Errorf("unknown strategy type %q", tpe)
}
return s, nil
}
// Returns the network statistics for the network interfaces represented by the NetworkRuntimeInfo.
func getNetworkInterfaceStats(interfaceName string) (*types.NetworkInterface, error) {
out := &types.NetworkInterface{Name: interfaceName}
// This can happen if the network runtime information is missing - possible if the
// container was created by an old version of libcontainer.
if interfaceName == "" {
return out, nil
}
type netStatsPair struct {
// Where to write the output.
Out *uint64
// The network stats file to read.
File string
}
// Ingress for host veth is from the container. Hence tx_bytes stat on the host veth is actually number of bytes received by the container.
netStats := []netStatsPair{
{Out: &out.RxBytes, File: "tx_bytes"},
{Out: &out.RxPackets, File: "tx_packets"},
{Out: &out.RxErrors, File: "tx_errors"},
{Out: &out.RxDropped, File: "tx_dropped"},
{Out: &out.TxBytes, File: "rx_bytes"},
{Out: &out.TxPackets, File: "rx_packets"},
{Out: &out.TxErrors, File: "rx_errors"},
{Out: &out.TxDropped, File: "rx_dropped"},
}
for _, netStat := range netStats {
data, err := readSysfsNetworkStats(interfaceName, netStat.File)
if err != nil {
return nil, err
}
*(netStat.Out) = data
}
return out, nil
}
// Reads the specified statistics available under /sys/class/net/<EthInterface>/statistics
func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) {
data, err := os.ReadFile(filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile))
if err != nil {
return 0, err
}
return strconv.ParseUint(string(bytes.TrimSpace(data)), 10, 64)
}
// loopback is a network strategy that provides a basic loopback device
type loopback struct{}
func (l *loopback) create(n *network, nspid int) error {
return nil
}
func (l *loopback) initialize(config *network) error {
return netlink.LinkSetUp(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: "lo"}})
}
func (l *loopback) attach(n *configs.Network) (err error) {
return nil
}
func (l *loopback) detach(n *configs.Network) (err error) {
return nil
}
package libcontainer
import (
"errors"
"fmt"
"os"
"path/filepath"
"golang.org/x/sys/unix"
)
type PressureLevel uint
const (
LowPressure PressureLevel = iota
MediumPressure
CriticalPressure
)
func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) {
evFile, err := os.Open(filepath.Join(cgDir, evName))
if err != nil {
return nil, err
}
fd, err := unix.Eventfd(0, unix.EFD_CLOEXEC)
if err != nil {
evFile.Close()
return nil, err
}
eventfd := os.NewFile(uintptr(fd), "eventfd")
eventControlPath := filepath.Join(cgDir, "cgroup.event_control")
data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
if err := os.WriteFile(eventControlPath, []byte(data), 0o700); err != nil {
eventfd.Close()
evFile.Close()
return nil, err
}
ch := make(chan struct{})
go func() {
defer func() {
eventfd.Close()
evFile.Close()
close(ch)
}()
buf := make([]byte, 8)
for {
if _, err := eventfd.Read(buf); err != nil {
return
}
// When a cgroup is destroyed, an event is sent to eventfd.
// So if the control path is gone, return instead of notifying.
if _, err := os.Lstat(eventControlPath); os.IsNotExist(err) {
return
}
ch <- struct{}{}
}
}()
return ch, nil
}
// notifyOnOOM returns a channel on which you can expect OOM events.
// If the process dies without an OOM, the channel will be closed.
func notifyOnOOM(dir string) (<-chan struct{}, error) {
if dir == "" {
return nil, errors.New("memory controller missing")
}
return registerMemoryEvent(dir, "memory.oom_control", "")
}
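// Consumer-side sketch (not from the original source); the cgroup v1 memory
// controller path is hypothetical:
//
//	ch, err := notifyOnOOM("/sys/fs/cgroup/memory/mycontainer")
//	if err == nil {
//		for range ch {
//			// an OOM event occurred; the channel closes when the cgroup goes away
//		}
//	}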
func notifyMemoryPressure(dir string, level PressureLevel) (<-chan struct{}, error) {
if dir == "" {
return nil, errors.New("memory controller missing")
}
if level > CriticalPressure {
return nil, fmt.Errorf("invalid pressure level %d", level)
}
levelStr := []string{"low", "medium", "critical"}[level]
return registerMemoryEvent(dir, "memory.pressure_level", levelStr)
}
package libcontainer
import (
"fmt"
"os"
"path/filepath"
"unsafe"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func registerMemoryEventV2(cgDir, evName, cgEvName string) (<-chan struct{}, error) {
fd, err := unix.InotifyInit()
if err != nil {
return nil, fmt.Errorf("unable to init inotify: %w", err)
}
// Watch for OOM kill events.
evFd, err := unix.InotifyAddWatch(fd, filepath.Join(cgDir, evName), unix.IN_MODIFY)
if err != nil {
unix.Close(fd)
return nil, fmt.Errorf("unable to add inotify watch: %w", err)
}
// The cgroup filesystem does not generate `unix.IN_DELETE|unix.IN_DELETE_SELF` events, so also watch cgroup.events to detect when all processes have exited.
cgFd, err := unix.InotifyAddWatch(fd, filepath.Join(cgDir, cgEvName), unix.IN_MODIFY)
if err != nil {
unix.Close(fd)
return nil, fmt.Errorf("unable to add inotify watch: %w", err)
}
ch := make(chan struct{})
go func() {
var (
buffer [unix.SizeofInotifyEvent + unix.PathMax + 1]byte
offset uint32
)
defer func() {
unix.Close(fd)
close(ch)
}()
for {
n, err := unix.Read(fd, buffer[:])
if err == unix.EINTR { //nolint:errorlint // unix errors are bare
continue
}
if err != nil {
err = os.NewSyscallError("read", err)
logrus.Warnf("unable to read event data from inotify, got error: %v", err)
return
}
if n < unix.SizeofInotifyEvent {
logrus.Warnf("we should read at least %d bytes from inotify, but got %d bytes.", unix.SizeofInotifyEvent, n)
return
}
offset = 0
for offset <= uint32(n-unix.SizeofInotifyEvent) {
rawEvent := (*unix.InotifyEvent)(unsafe.Pointer(&buffer[offset]))
offset += unix.SizeofInotifyEvent + rawEvent.Len
if rawEvent.Mask&unix.IN_MODIFY != unix.IN_MODIFY {
continue
}
switch int(rawEvent.Wd) {
case evFd:
oom, err := fscommon.GetValueByKey(cgDir, evName, "oom_kill")
if err != nil || oom > 0 {
ch <- struct{}{}
}
case cgFd:
pids, err := fscommon.GetValueByKey(cgDir, cgEvName, "populated")
if err != nil || pids == 0 {
return
}
}
}
}
}()
return ch, nil
}
// notifyOnOOMV2 returns a channel on which you can expect OOM events.
// If the process dies without an OOM, the channel will be closed.
func notifyOnOOMV2(path string) (<-chan struct{}, error) {
return registerMemoryEventV2(path, "memory.events", "cgroup.events")
}
package libcontainer
import (
"errors"
"io"
"math"
"os"
"github.com/opencontainers/runc/libcontainer/configs"
)
var errInvalidProcess = errors.New("invalid process")
type processOperations interface {
wait() (*os.ProcessState, error)
signal(sig os.Signal) error
pid() int
}
// Process defines the configuration and IO for a process inside a container.
//
// Note that some Process properties are also present in container configuration
// ([configs.Config]). In all such cases, Process properties take precedence
// over container configuration ones.
type Process struct {
// The command to be run followed by any arguments.
Args []string
// Env specifies the environment variables for the process.
Env []string
// UID and GID of the executing process running inside the container
// local to the container's user and group configuration.
UID, GID int
// AdditionalGroups specifies the gids that should be added to supplementary groups
// in addition to those that the user belongs to.
AdditionalGroups []int
// Cwd will change the process's current working directory inside the container's rootfs.
Cwd string
// Stdin is a reader which provides the standard input stream.
Stdin io.Reader
// Stdout is a writer which receives the standard output stream.
Stdout io.Writer
// Stderr is a writer which receives the standard error stream.
Stderr io.Writer
// ExtraFiles specifies additional open files to be inherited by the process.
ExtraFiles []*os.File
// Open handles to cloned binaries -- see exeseal.CloneSelfExe for more details.
clonedExes []*os.File
// Initial size for the console.
ConsoleWidth uint16
ConsoleHeight uint16
// Capabilities specify the capabilities to keep when executing the process.
// All capabilities not specified will be dropped from the processes capability mask.
//
// If not nil, takes precedence over container's [configs.Config.Capabilities].
Capabilities *configs.Capabilities
// AppArmorProfile specifies the profile to apply to the process and is
// changed at the time the process is executed.
//
// If not empty, takes precedence over container's [configs.Config.AppArmorProfile].
AppArmorProfile string
// Label specifies the label to apply to the process. It is commonly used by selinux.
//
// If not empty, takes precedence over container's [configs.Config.ProcessLabel].
Label string
// NoNewPrivileges controls whether processes can gain additional privileges.
//
// If not nil, takes precedence over container's [configs.Config.NoNewPrivileges].
NoNewPrivileges *bool
// Rlimits specifies the resource limits, such as max open files, to set for the process.
// If unset, the process will inherit rlimits from the parent process.
//
// If not empty, takes precedence over container's [configs.Config.Rlimit].
Rlimits []configs.Rlimit
// ConsoleSocket provides the masterfd console.
ConsoleSocket *os.File
// PidfdSocket provides a socket over which the process's own pidfd is sent.
PidfdSocket *os.File
// Init specifies whether the process is the first process in the container.
Init bool
ops processOperations
// LogLevel is a string containing a numeric representation of the current
// log level (e.g. "4", never "info"). It is passed on to runc init as the
// _LIBCONTAINER_LOGLEVEL environment variable.
LogLevel string
// SubCgroupPaths specifies sub-cgroups to run the process in.
// Map keys are controller names, map values are paths (relative to
// container's top-level cgroup).
//
// If empty, the default top-level container's cgroup is used.
//
// For cgroup v2, the only key allowed is "".
SubCgroupPaths map[string]string
// Scheduler represents the scheduling attributes for a process.
//
// If not empty, takes precedence over container's [configs.Config.Scheduler].
Scheduler *configs.Scheduler
// IOPriority is a process I/O priority.
//
// If not empty, takes precedence over container's [configs.Config.IOPriority].
IOPriority *configs.IOPriority
}
// Wait waits for the process to exit, and releases any resources
// associated with the Process.
func (p Process) Wait() (*os.ProcessState, error) {
if p.ops == nil {
return nil, errInvalidProcess
}
return p.ops.wait()
}
// Pid returns the process ID
func (p Process) Pid() (int, error) {
// math.MinInt32 is returned here, because it's an invalid value
// for the kill() system call.
if p.ops == nil {
return math.MinInt32, errInvalidProcess
}
return p.ops.pid(), nil
}
// Signal sends a signal to the Process.
func (p Process) Signal(sig os.Signal) error {
if p.ops == nil {
return errInvalidProcess
}
return p.ops.signal(sig)
}
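// A minimal Process, as a caller-side sketch (not from the original source);
// the container value and error handling are assumed to exist elsewhere:
//
//	p := &libcontainer.Process{
//		Args:   []string{"/bin/sh", "-c", "echo hello"},
//		Env:    []string{"PATH=/usr/bin:/bin"},
//		Stdin:  os.Stdin,
//		Stdout: os.Stdout,
//		Stderr: os.Stderr,
//		Init:   true,
//	}
//	if err := container.Run(p); err == nil {
//		_, _ = p.Wait()
//	}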
// closeClonedExes cleans up any existing cloned binaries associated with the
// Process.
func (p *Process) closeClonedExes() {
for _, exe := range p.clonedExes {
_ = exe.Close()
}
p.clonedExes = nil
}
// IO holds the process's STDIO
type IO struct {
Stdin io.WriteCloser
Stdout io.ReadCloser
Stderr io.ReadCloser
}
package libcontainer
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net"
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"sync"
"time"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/internal/userns"
"github.com/opencontainers/runc/libcontainer/logs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
)
type parentProcess interface {
// pid returns the pid for the running process.
pid() int
// start starts the process execution.
start() error
// terminate sends a SIGKILL to the process and waits for the exit.
terminate() error
// wait waits on the process returning the process state.
wait() (*os.ProcessState, error)
// startTime returns the process start time.
startTime() (uint64, error)
signal(os.Signal) error
externalDescriptors() []string
setExternalDescriptors(fds []string)
forwardChildLogs() chan error
}
type processComm struct {
// Used to send initial configuration to "runc init" and for "runc init" to
// indicate that it is ready.
initSockParent *os.File
initSockChild *os.File
// Used for control messages between parent and "runc init".
syncSockParent *syncSocket
syncSockChild *syncSocket
// Used for log forwarding from "runc init" to the parent.
logPipeParent *os.File
logPipeChild *os.File
}
func newProcessComm() (*processComm, error) {
var (
comm processComm
err error
)
comm.initSockParent, comm.initSockChild, err = utils.NewSockPair("init")
if err != nil {
return nil, fmt.Errorf("unable to create init pipe: %w", err)
}
comm.syncSockParent, comm.syncSockChild, err = newSyncSockpair("sync")
if err != nil {
return nil, fmt.Errorf("unable to create sync pipe: %w", err)
}
comm.logPipeParent, comm.logPipeChild, err = os.Pipe()
if err != nil {
return nil, fmt.Errorf("unable to create log pipe: %w", err)
}
return &comm, nil
}
func (c *processComm) closeChild() {
_ = c.initSockChild.Close()
_ = c.syncSockChild.Close()
_ = c.logPipeChild.Close()
}
func (c *processComm) closeParent() {
_ = c.initSockParent.Close()
_ = c.syncSockParent.Close()
// c.logPipeParent is kept alive for ForwardLogs
}
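// containerProcess holds the parent-side state common to the init and setns
// process implementations: the underlying exec.Cmd, the parent/child
// communication channels, the cgroup manager, and the owning container.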
type containerProcess struct {
cmd *exec.Cmd
comm *processComm
config *initConfig
manager cgroups.Manager
fds []string
process *Process
bootstrapData io.Reader
container *Container
}
func (p *containerProcess) pid() int {
return p.cmd.Process.Pid
}
func (p *containerProcess) startTime() (uint64, error) {
stat, err := system.Stat(p.pid())
return stat.StartTime, err
}
func (p *containerProcess) signal(sig os.Signal) error {
s, ok := sig.(unix.Signal)
if !ok {
return errors.New("os: unsupported signal type")
}
return unix.Kill(p.pid(), s)
}
func (p *containerProcess) externalDescriptors() []string {
return p.fds
}
func (p *containerProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
func (p *containerProcess) forwardChildLogs() chan error {
return logs.ForwardLogs(p.comm.logPipeParent)
}
// terminate sends a SIGKILL to the forked process for the setns routine, then
// waits to avoid the process becoming a zombie.
func (p *containerProcess) terminate() error {
if p.cmd.Process == nil {
return nil
}
err := p.cmd.Process.Kill()
if _, werr := p.wait(); err == nil {
err = werr
}
return err
}
func (p *containerProcess) wait() (*os.ProcessState, error) { //nolint:unparam
err := p.cmd.Wait()
// Return actual ProcessState even on Wait error
return p.cmd.ProcessState, err
}
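// setnsProcess is a process started in the namespaces (and cgroups) of an
// existing container, such as one created by "runc exec".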
type setnsProcess struct {
containerProcess
cgroupPaths map[string]string
rootlessCgroups bool
intelRdtPath string
initProcessPid int
}
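// start launches the setns process, adds it to the container's cgroups (and
// Intel RDT group, if configured), sends the bootstrap data and init config,
// and drives the sync protocol until procReady is received.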
func (p *setnsProcess) start() (retErr error) {
defer p.comm.closeParent()
// get the "before" value of oom kill count
oom, _ := p.manager.OOMKillCount()
err := p.cmd.Start()
// close the child-side of the pipes (controlled by child)
p.comm.closeChild()
if err != nil {
return fmt.Errorf("error starting setns process: %w", err)
}
defer func() {
if retErr != nil {
if newOom, err := p.manager.OOMKillCount(); err == nil && newOom != oom {
// Someone in this cgroup was killed; this _might_ be us.
retErr = fmt.Errorf("%w (possibly OOM-killed)", retErr)
}
err := ignoreTerminateErrors(p.terminate())
if err != nil {
logrus.WithError(err).Warn("unable to terminate setnsProcess")
}
}
}()
if p.bootstrapData != nil {
if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil {
return fmt.Errorf("error copying bootstrap data to pipe: %w", err)
}
}
if err := p.execSetns(); err != nil {
return fmt.Errorf("error executing setns process: %w", err)
}
for _, path := range p.cgroupPaths {
if err := cgroups.WriteCgroupProc(path, p.pid()); err != nil && !p.rootlessCgroups {
// On cgroup v2 + nesting + domain controllers, WriteCgroupProc may fail with EBUSY.
// https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
// Try to join the cgroup of InitProcessPid.
if cgroups.IsCgroup2UnifiedMode() && p.initProcessPid != 0 {
initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
if initCgErr == nil {
if initCgPath, ok := initCg[""]; ok {
initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath)
logrus.Debugf("adding pid %d to cgroups %v failed (%v), attempting to join %q (obtained from %s)",
p.pid(), p.cgroupPaths, err, initCg, initCgDirpath)
// NOTE: initCgDirpath is not guaranteed to exist because we didn't pause the container.
err = cgroups.WriteCgroupProc(initCgDirpath, p.pid())
}
}
}
if err != nil {
return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
}
}
}
if p.intelRdtPath != "" {
// If the Intel RDT "resource control" filesystem path exists.
_, err := os.Stat(p.intelRdtPath)
if err == nil {
if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
return fmt.Errorf("error adding pid %d to Intel RDT: %w", p.pid(), err)
}
}
}
if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil {
return fmt.Errorf("error writing config to pipe: %w", err)
}
var seenProcReady bool
ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error {
switch sync.Type {
case procReady:
seenProcReady = true
// Set rlimits. This has to be done here because we lose permissions
// to raise the limits once we enter a user namespace.
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return fmt.Errorf("error setting rlimits for ready process: %w", err)
}
// Sync with child.
if err := writeSync(p.comm.syncSockParent, procRun); err != nil {
return err
}
case procHooks:
// This shouldn't happen.
panic("unexpected procHooks in setns")
case procMountPlease:
// This shouldn't happen.
panic("unexpected procMountPlease in setns")
case procSeccomp:
if p.config.Config.Seccomp.ListenerPath == "" {
return errors.New("seccomp listenerPath is not set")
}
if sync.Arg == nil {
return fmt.Errorf("sync %q is missing an argument", sync.Type)
}
var srcFd int
if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil {
return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err)
}
seccompFd, err := pidGetFd(p.pid(), srcFd)
if err != nil {
return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err)
}
defer seccompFd.Close()
// We have a copy; the child can keep working. We don't need to
// wait for the seccomp notify listener to get the fd before we
// permit the child to continue because the child will happily wait
// for the listener if it hits SCMP_ACT_NOTIFY.
if err := writeSync(p.comm.syncSockParent, procSeccompDone); err != nil {
return err
}
bundle, annotations := utils.Annotations(p.config.Config.Labels)
containerProcessState := &specs.ContainerProcessState{
Version: specs.Version,
Fds: []string{specs.SeccompFdName},
Pid: p.cmd.Process.Pid,
Metadata: p.config.Config.Seccomp.ListenerMetadata,
State: specs.State{
Version: specs.Version,
ID: p.config.ContainerID,
Status: specs.StateRunning,
Pid: p.initProcessPid,
Bundle: bundle,
Annotations: annotations,
},
}
if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath,
containerProcessState, seccompFd); err != nil {
return err
}
default:
return errors.New("invalid JSON payload from child")
}
return nil
})
if err := p.comm.syncSockParent.Shutdown(unix.SHUT_WR); err != nil && ierr == nil {
return err
}
if !seenProcReady && ierr == nil {
ierr = errors.New("procReady not received")
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
_, _ = p.wait()
return ierr
}
return nil
}
// execSetns runs the process that executes C code to perform the setns calls.
// Because setns support requires the C process to fork off a child and perform
// the setns before the Go runtime boots, we wait on the process to die and
// receive the child's pid over the provided pipe.
func (p *setnsProcess) execSetns() error {
status, err := p.cmd.Process.Wait()
if err != nil {
_ = p.cmd.Wait()
return fmt.Errorf("error waiting on setns process to finish: %w", err)
}
if !status.Success() {
_ = p.cmd.Wait()
return &exec.ExitError{ProcessState: status}
}
var pid *pid
if err := json.NewDecoder(p.comm.initSockParent).Decode(&pid); err != nil {
_ = p.cmd.Wait()
return fmt.Errorf("error reading pid from init pipe: %w", err)
}
// Clean up the zombie parent process
// On Unix systems FindProcess always succeeds.
firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
// Ignore the error in case the child has already been reaped for any reason
_, _ = firstChildProcess.Wait()
process, err := os.FindProcess(pid.Pid)
if err != nil {
return err
}
p.cmd.Process = process
p.process.ops = p
return nil
}
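// initProcess is the parent-side handle for the container's init ("runc init")
// process; in addition to the shared containerProcess state it owns the
// optional Intel RDT manager.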
type initProcess struct {
containerProcess
intelRdtManager *intelrdt.Manager
}
// getChildPid receives the final child's pid over the provided pipe.
func (p *initProcess) getChildPid() (int, error) {
var pid pid
if err := json.NewDecoder(p.comm.initSockParent).Decode(&pid); err != nil {
_ = p.cmd.Wait()
return -1, err
}
// Clean up the zombie parent process
// On Unix systems FindProcess always succeeds.
firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
// Ignore the error in case the child has already been reaped for any reason
_, _ = firstChildProcess.Wait()
return pid.Pid, nil
}
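// waitForChildExit waits for the intermediate bootstrap process to exit and
// then adopts the final child (childPid) as this process's os.Process.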
func (p *initProcess) waitForChildExit(childPid int) error {
status, err := p.cmd.Process.Wait()
if err != nil {
_ = p.cmd.Wait()
return err
}
if !status.Success() {
_ = p.cmd.Wait()
return &exec.ExitError{ProcessState: status}
}
process, err := os.FindProcess(childPid)
if err != nil {
return err
}
p.cmd.Process = process
p.process.ops = p
return nil
}
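// mountSourceRequestFn asks the host-side mount thread (see
// goCreateMountSources) to open a mountSource for the given mount.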
type mountSourceRequestFn func(*configs.Mount) (*mountSource, error)
// goCreateMountSources spawns a goroutine which creates open_tree(2)-style
// mountfds based on the requested configs.Mount configuration. The returned
// requestFn and cancelFn are used to interact with the goroutine.
//
// The caller of the returned mountSourceRequestFn is responsible for closing
// the returned file.
func (p *initProcess) goCreateMountSources(ctx context.Context) (mountSourceRequestFn, context.CancelFunc, error) {
type response struct {
src *mountSource
err error
}
errCh := make(chan error, 1)
requestCh := make(chan *configs.Mount)
responseCh := make(chan response)
ctx, cancelFn := context.WithTimeout(ctx, 1*time.Minute)
go func() {
// We lock this thread because we need to setns(2) here. There is no
// UnlockOSThread() here, to ensure that the Go runtime will kill this
// thread once this goroutine returns (ensuring no other goroutines run
// in this context).
runtime.LockOSThread()
// Detach from the shared fs of the rest of the Go process in order to
// be able to CLONE_NEWNS.
if err := unix.Unshare(unix.CLONE_FS); err != nil {
err = os.NewSyscallError("unshare(CLONE_FS)", err)
errCh <- fmt.Errorf("mount source thread: %w", err)
return
}
// Attach to the container's mount namespace.
nsFd, err := os.Open(fmt.Sprintf("/proc/%d/ns/mnt", p.pid()))
if err != nil {
errCh <- fmt.Errorf("mount source thread: open container mntns: %w", err)
return
}
defer nsFd.Close()
if err := unix.Setns(int(nsFd.Fd()), unix.CLONE_NEWNS); err != nil {
err = os.NewSyscallError("setns", err)
errCh <- fmt.Errorf("mount source thread: join container mntns: %w", err)
return
}
// No errors during setup!
close(errCh)
logrus.Debugf("mount source thread: successfully running in container mntns")
nsHandles := new(userns.Handles)
defer nsHandles.Release()
loop:
for {
select {
case m, ok := <-requestCh:
if !ok {
break loop
}
src, err := mountFd(nsHandles, m)
logrus.Debugf("mount source thread: handling request for %q: %v %v", m.Source, src, err)
responseCh <- response{
src: src,
err: err,
}
case <-ctx.Done():
break loop
}
}
logrus.Debugf("mount source thread: closing thread: %v", ctx.Err())
close(responseCh)
}()
// Check for setup errors.
err := <-errCh
if err != nil {
cancelFn()
return nil, nil, err
}
// TODO: Switch to context.AfterFunc when we switch to Go 1.21.
var requestChCloseOnce sync.Once
requestFn := func(m *configs.Mount) (*mountSource, error) {
var err error
select {
case requestCh <- m:
select {
case resp, ok := <-responseCh:
if ok {
return resp.src, resp.err
}
case <-ctx.Done():
err = fmt.Errorf("receive mount source context cancelled: %w", ctx.Err())
}
case <-ctx.Done():
err = fmt.Errorf("send mount request cancelled: %w", ctx.Err())
}
requestChCloseOnce.Do(func() { close(requestCh) })
return nil, err
}
return requestFn, cancelFn, nil
}
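// start starts the container's init process, applies the cgroup and Intel RDT
// configuration, sends the bootstrap data and init config, and drives the sync
// protocol (procMountPlease, procSeccomp, procReady, procHooks) until the
// container reaches the created state.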
func (p *initProcess) start() (retErr error) {
defer p.comm.closeParent()
err := p.cmd.Start()
p.process.ops = p
// close the child-side of the pipes (controlled by child)
p.comm.closeChild()
if err != nil {
p.process.ops = nil
return fmt.Errorf("unable to start init: %w", err)
}
defer func() {
if retErr != nil {
// Find out if init is killed by the kernel's OOM killer.
// Get the count before killing init, as otherwise the cgroup
// might be removed by systemd.
oom, err := p.manager.OOMKillCount()
if err != nil {
logrus.WithError(err).Warn("unable to get oom kill count")
} else if oom > 0 {
// It does not matter what the particular error was;
// its cause is most probably OOM, so report that.
const oomError = "container init was OOM-killed (memory limit too low?)"
if logrus.GetLevel() >= logrus.DebugLevel {
// Only show the original error if debug is set,
// as it is not generally very useful.
retErr = fmt.Errorf(oomError+": %w", retErr)
} else {
retErr = errors.New(oomError)
}
}
// Terminate the process to ensure we can remove cgroups.
if err := ignoreTerminateErrors(p.terminate()); err != nil {
logrus.WithError(err).Warn("unable to terminate initProcess")
}
_ = p.manager.Destroy()
if p.intelRdtManager != nil {
_ = p.intelRdtManager.Destroy()
}
}
}()
// Do this before syncing with child so that no children can escape the
// cgroup. We don't need to worry about not doing this and not being root
// because we'd be using the rootless cgroup manager in that case.
if err := p.manager.Apply(p.pid()); err != nil {
if errors.Is(err, cgroups.ErrRootless) {
// ErrRootless is to be ignored except when
// the container doesn't have private pidns.
if !p.config.Config.Namespaces.IsPrivate(configs.NEWPID) {
// TODO: make this an error in runc 1.3.
logrus.Warn("Creating a rootless container with no cgroup and no private pid namespace. " +
"Such configuration is strongly discouraged (as it is impossible to properly kill all container's processes) " +
"and will result in an error in a future runc version.")
}
} else {
return fmt.Errorf("unable to apply cgroup configuration: %w", err)
}
}
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Apply(p.pid()); err != nil {
return fmt.Errorf("unable to apply Intel RDT configuration: %w", err)
}
}
if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil {
return fmt.Errorf("can't copy bootstrap data to pipe: %w", err)
}
childPid, err := p.getChildPid()
if err != nil {
return fmt.Errorf("can't get final child's PID from pipe: %w", err)
}
// Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up.
fds, err := getPipeFds(childPid)
if err != nil {
return fmt.Errorf("error getting pipe fds for pid %d: %w", childPid, err)
}
p.setExternalDescriptors(fds)
// Wait for our first child to exit
if err := p.waitForChildExit(childPid); err != nil {
return fmt.Errorf("error waiting for our first child to exit: %w", err)
}
// Spin up a goroutine to handle remapping mount requests by runc init.
// There is no point doing this for rootless containers because they cannot
// configure MOUNT_ATTR_IDMAP, nor use OPEN_TREE_CLONE. We could just
// service plain-open requests for plain bind-mounts but there's no need
// (rootless containers will never have permission issues on a source mount
// that the parent process can help with -- they are the same user).
var mountRequest mountSourceRequestFn
if !p.container.config.RootlessEUID {
request, cancel, err := p.goCreateMountSources(context.Background())
if err != nil {
return fmt.Errorf("error spawning mount remapping thread: %w", err)
}
defer cancel()
mountRequest = request
}
if err := p.createNetworkInterfaces(); err != nil {
return fmt.Errorf("error creating network interfaces: %w", err)
}
// initConfig.SpecState is only needed to run hooks that are executed
// inside a container, i.e. CreateContainer and StartContainer.
if p.config.Config.HasHook(configs.CreateContainer, configs.StartContainer) {
p.config.SpecState, err = p.container.currentOCIState()
if err != nil {
return fmt.Errorf("error getting current state: %w", err)
}
}
if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil {
return fmt.Errorf("error sending config to init process: %w", err)
}
var seenProcReady bool
ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error {
switch sync.Type {
case procMountPlease:
if mountRequest == nil {
return fmt.Errorf("cannot fulfil mount requests as a rootless user")
}
var m *configs.Mount
if sync.Arg == nil {
return fmt.Errorf("sync %q is missing an argument", sync.Type)
}
if err := json.Unmarshal(*sync.Arg, &m); err != nil {
return fmt.Errorf("sync %q passed invalid mount arg: %w", sync.Type, err)
}
mnt, err := mountRequest(m)
if err != nil {
return fmt.Errorf("failed to fulfil mount request: %w", err)
}
defer mnt.file.Close()
arg, err := json.Marshal(mnt)
if err != nil {
return fmt.Errorf("sync %q failed to marshal mountSource: %w", sync.Type, err)
}
argMsg := json.RawMessage(arg)
if err := doWriteSync(p.comm.syncSockParent, syncT{
Type: procMountFd,
Arg: &argMsg,
File: mnt.file,
}); err != nil {
return err
}
case procSeccomp:
if p.config.Config.Seccomp.ListenerPath == "" {
return errors.New("seccomp listenerPath is not set")
}
var srcFd int
if sync.Arg == nil {
return fmt.Errorf("sync %q is missing an argument", sync.Type)
}
if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil {
return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err)
}
seccompFd, err := pidGetFd(p.pid(), srcFd)
if err != nil {
return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err)
}
defer seccompFd.Close()
// We have a copy; the child can keep working. We don't need to
// wait for the seccomp notify listener to get the fd before we
// permit the child to continue because the child will happily wait
// for the listener if it hits SCMP_ACT_NOTIFY.
if err := writeSync(p.comm.syncSockParent, procSeccompDone); err != nil {
return err
}
s, err := p.container.currentOCIState()
if err != nil {
return err
}
// initProcessStartTime hasn't been set yet.
s.Pid = p.cmd.Process.Pid
s.Status = specs.StateCreating
containerProcessState := &specs.ContainerProcessState{
Version: specs.Version,
Fds: []string{specs.SeccompFdName},
Pid: s.Pid,
Metadata: p.config.Config.Seccomp.ListenerMetadata,
State: *s,
}
if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath,
containerProcessState, seccompFd); err != nil {
return err
}
case procReady:
seenProcReady = true
// Set rlimits. This has to be done here because we lose permissions
// to raise the limits once we enter a user namespace.
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return fmt.Errorf("error setting rlimits for ready process: %w", err)
}
// generate a timestamp indicating when the container was started
p.container.created = time.Now().UTC()
p.container.state = &createdState{
c: p.container,
}
// NOTE: If the procRun state has been synced and the
// runc-create process has been killed for some reason,
// the runc-init[2:stage] process will be leaked. The
// runc command will also fail to parse the root directory
// because the container doesn't have a state.json.
//
// In order to clean up the runc-init[2:stage] process via
// runc delete/stop, we store the state before the
// procRun sync.
state, uerr := p.container.updateState(p)
if uerr != nil {
return fmt.Errorf("unable to store init state: %w", uerr)
}
p.container.initProcessStartTime = state.InitProcessStartTime
// Sync with child.
if err := writeSync(p.comm.syncSockParent, procRun); err != nil {
return err
}
case procHooks:
// Set up the cgroup before the prestart hook, so that the prestart hook can apply cgroup permissions.
if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
return fmt.Errorf("error setting cgroup config for procHooks process: %w", err)
}
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Set(p.config.Config); err != nil {
return fmt.Errorf("error setting Intel RDT config for procHooks process: %w", err)
}
}
if p.config.Config.HasHook(configs.Prestart, configs.CreateRuntime) {
s, err := p.container.currentOCIState()
if err != nil {
return err
}
// initProcessStartTime hasn't been set yet.
s.Pid = p.cmd.Process.Pid
s.Status = specs.StateCreating
hooks := p.config.Config.Hooks
if err := hooks.Run(configs.Prestart, s); err != nil {
return err
}
if err := hooks.Run(configs.CreateRuntime, s); err != nil {
return err
}
}
// Sync with child.
if err := writeSync(p.comm.syncSockParent, procHooksDone); err != nil {
return err
}
default:
return errors.New("invalid JSON payload from child")
}
return nil
})
if err := p.comm.syncSockParent.Shutdown(unix.SHUT_WR); err != nil && ierr == nil {
return err
}
if !seenProcReady && ierr == nil {
ierr = errors.New("procReady not received")
}
if ierr != nil {
return fmt.Errorf("error during container init: %w", ierr)
}
return nil
}
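// createNetworkInterfaces creates the configured network interfaces inside the
// network namespace of the init process, using the strategy registered for
// each network type.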
func (p *initProcess) createNetworkInterfaces() error {
for _, config := range p.config.Config.Networks {
strategy, err := getStrategy(config.Type)
if err != nil {
return err
}
n := &network{
Network: *config,
}
if err := strategy.create(n, p.pid()); err != nil {
return err
}
p.config.Networks = append(p.config.Networks, n)
}
return nil
}
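// pidGetFd obtains a duplicate of file descriptor srcFd from process pid,
// using pidfd_open(2) and pidfd_getfd(2).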
func pidGetFd(pid, srcFd int) (*os.File, error) {
pidFd, err := unix.PidfdOpen(pid, 0)
if err != nil {
return nil, os.NewSyscallError("pidfd_open", err)
}
defer unix.Close(pidFd)
fd, err := unix.PidfdGetfd(pidFd, srcFd, 0)
if err != nil {
return nil, os.NewSyscallError("pidfd_getfd", err)
}
return os.NewFile(uintptr(fd), "[pidfd_getfd]"), nil
}
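// sendContainerProcessState connects to the seccomp listener at listenerPath
// and sends the JSON-encoded ContainerProcessState together with the seccomp
// notify fd over the unix socket.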
func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, file *os.File) error {
conn, err := net.Dial("unix", listenerPath)
if err != nil {
return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err)
}
socket, err := conn.(*net.UnixConn).File()
if err != nil {
return fmt.Errorf("cannot get seccomp socket: %w", err)
}
defer socket.Close()
b, err := json.Marshal(state)
if err != nil {
return fmt.Errorf("cannot marshall seccomp state: %w", err)
}
if err := utils.SendRawFd(socket, string(b), file.Fd()); err != nil {
return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err)
}
runtime.KeepAlive(file)
return nil
}
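// getPipeFds returns the targets of the /proc/<pid>/fd/{0,1,2} symlinks,
// i.e. the paths backing the process's stdio.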
func getPipeFds(pid int) ([]string, error) {
fds := make([]string, 3)
dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
for i := 0; i < 3; i++ {
// XXX: This breaks if the path is not a valid symlink (which can
// happen in certain particularly unlucky mount namespace setups).
f := filepath.Join(dirPath, strconv.Itoa(i))
target, err := os.Readlink(f)
if err != nil {
// Ignore permission errors, for rootless containers and other
// non-dumpable processes. If we can't get the fd for a particular
// file, there's not much we can do.
if os.IsPermission(err) {
continue
}
return fds, err
}
fds[i] = target
}
return fds, nil
}
// InitializeIO creates pipes for use with the process's stdio and returns the
// opposite side for each. Do not use this if you want to have a pseudoterminal
// set up for you by libcontainer (TODO: fix that too).
// TODO: This is mostly unnecessary, and should be handled by clients.
func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
var fds []uintptr
i = &IO{}
// cleanup in case of an error
defer func() {
if err != nil {
for _, fd := range fds {
_ = unix.Close(int(fd))
}
}
}()
// STDIN
r, w, err := os.Pipe()
if err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stdin, i.Stdin = r, w
// STDOUT
if r, w, err = os.Pipe(); err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stdout, i.Stdout = w, r
// STDERR
if r, w, err = os.Pipe(); err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stderr, i.Stderr = w, r
// change ownership of the pipes in case we are in a user namespace
for _, fd := range fds {
if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil {
return nil, &os.PathError{Op: "fchown", Path: "fd " + strconv.Itoa(int(fd)), Err: err}
}
}
return i, nil
}
package libcontainer
import (
"errors"
"os"
"os/exec"
"github.com/opencontainers/runc/libcontainer/system"
)
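// newRestoredProcess returns a restoredProcess wrapping cmd, recording its
// start time and external descriptors.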
func newRestoredProcess(cmd *exec.Cmd, fds []string) (*restoredProcess, error) {
var err error
pid := cmd.Process.Pid
stat, err := system.Stat(pid)
if err != nil {
return nil, err
}
return &restoredProcess{
cmd: cmd,
processStartTime: stat.StartTime,
fds: fds,
}, nil
}
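// restoredProcess represents a container process restored from a checkpoint;
// it cannot be started again, only signalled and waited on.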
type restoredProcess struct {
cmd *exec.Cmd
processStartTime uint64
fds []string
}
func (p *restoredProcess) start() error {
return errors.New("restored process cannot be started")
}
func (p *restoredProcess) pid() int {
return p.cmd.Process.Pid
}
func (p *restoredProcess) terminate() error {
err := p.cmd.Process.Kill()
if _, werr := p.wait(); err == nil {
err = werr
}
return err
}
func (p *restoredProcess) wait() (*os.ProcessState, error) {
// TODO: how do we wait on the actual process?
// maybe use --exec-cmd in criu
err := p.cmd.Wait()
if err != nil {
var exitErr *exec.ExitError
if !errors.As(err, &exitErr) {
return nil, err
}
}
st := p.cmd.ProcessState
return st, nil
}
func (p *restoredProcess) startTime() (uint64, error) {
return p.processStartTime, nil
}
func (p *restoredProcess) signal(s os.Signal) error {
return p.cmd.Process.Signal(s)
}
func (p *restoredProcess) externalDescriptors() []string {
return p.fds
}
func (p *restoredProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
func (p *restoredProcess) forwardChildLogs() chan error {
return nil
}
// nonChildProcess represents a process where the calling process is not
// the parent process. This process is created when Load loads a container
// from a persisted state.
type nonChildProcess struct {
processPid int
processStartTime uint64
fds []string
}
func (p *nonChildProcess) start() error {
return errors.New("restored process cannot be started")
}
func (p *nonChildProcess) pid() int {
return p.processPid
}
func (p *nonChildProcess) terminate() error {
return errors.New("restored process cannot be terminated")
}
func (p *nonChildProcess) wait() (*os.ProcessState, error) {
return nil, errors.New("restored process cannot be waited on")
}
func (p *nonChildProcess) startTime() (uint64, error) {
return p.processStartTime, nil
}
func (p *nonChildProcess) signal(s os.Signal) error {
proc, err := os.FindProcess(p.processPid)
if err != nil {
return err
}
return proc.Signal(s)
}
func (p *nonChildProcess) externalDescriptors() []string {
return p.fds
}
func (p *nonChildProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
func (p *nonChildProcess) forwardChildLogs() chan error {
return nil
}
package libcontainer
import (
"encoding/json"
"errors"
"fmt"
"os"
"path"
"path/filepath"
"strconv"
"strings"
"syscall"
"time"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/moby/sys/mountinfo"
"github.com/moby/sys/userns"
"github.com/mrunalp/fileutils"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
devices "github.com/opencontainers/runc/libcontainer/cgroups/devices/config"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
)
const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
// mountConfig contains mount data not specific to a mount point.
type mountConfig struct {
root string
label string
cgroup2Path string
rootlessCgroups bool
cgroupns bool
}
// mountEntry contains mount data specific to a mount point.
type mountEntry struct {
*configs.Mount
srcFile *mountSource
}
// srcName is only meant for error messages; it returns a "friendly" name.
func (m mountEntry) srcName() string {
if m.srcFile != nil {
return m.srcFile.file.Name()
}
return m.Source
}
func (m mountEntry) srcStat() (os.FileInfo, *syscall.Stat_t, error) {
var (
st os.FileInfo
err error
)
if m.srcFile != nil {
st, err = m.srcFile.file.Stat()
} else {
st, err = os.Stat(m.Source)
}
if err != nil {
return nil, nil, err
}
return st, st.Sys().(*syscall.Stat_t), nil
}
func (m mountEntry) srcStatfs() (*unix.Statfs_t, error) {
var st unix.Statfs_t
if m.srcFile != nil {
if err := unix.Fstatfs(int(m.srcFile.file.Fd()), &st); err != nil {
return nil, os.NewSyscallError("fstatfs", err)
}
} else {
if err := unix.Statfs(m.Source, &st); err != nil {
return nil, &os.PathError{Op: "statfs", Path: m.Source, Err: err}
}
}
return &st, nil
}
// needsSetupDev returns true if /dev needs to be set up.
func needsSetupDev(config *configs.Config) bool {
for _, m := range config.Mounts {
if m.Device == "bind" && utils.CleanPath(m.Destination) == "/dev" {
return false
}
}
return true
}
// prepareRootfs sets up the devices, mount points, and filesystems for use
// inside a new mount namespace. It doesn't set anything as ro. You must call
// finalizeRootfs after this function to finish setting up the rootfs.
func prepareRootfs(pipe *syncSocket, iConfig *initConfig) (err error) {
config := iConfig.Config
if err := prepareRoot(config); err != nil {
return fmt.Errorf("error preparing rootfs: %w", err)
}
mountConfig := &mountConfig{
root: config.Rootfs,
label: config.MountLabel,
cgroup2Path: iConfig.Cgroup2Path,
rootlessCgroups: config.RootlessCgroups,
cgroupns: config.Namespaces.Contains(configs.NEWCGROUP),
}
for _, m := range config.Mounts {
entry := mountEntry{Mount: m}
// Figure out whether we need to request runc to give us an
// open_tree(2)-style mountfd. For idmapped mounts, this is always
// necessary. For bind-mounts, this is only necessary if we cannot
// resolve the parent mount (this is only hit if you are running in a
// userns -- but for rootless the host-side thread can't help).
wantSourceFile := m.IsIDMapped()
if m.IsBind() && !config.RootlessEUID {
if _, err := os.Stat(m.Source); err != nil {
wantSourceFile = true
}
}
if wantSourceFile {
// Request a source file from the host.
if err := writeSyncArg(pipe, procMountPlease, m); err != nil {
return fmt.Errorf("failed to request mountfd for %q: %w", m.Source, err)
}
sync, err := readSyncFull(pipe, procMountFd)
if err != nil {
return fmt.Errorf("mountfd request for %q failed: %w", m.Source, err)
}
if sync.File == nil {
return fmt.Errorf("mountfd request for %q: response missing attached fd", m.Source)
}
defer sync.File.Close()
// Sanity-check to make sure we didn't get the wrong fd back. Note
// that while m.Source might contain symlinks, the (*os.File).Name
// is based on the path provided to os.OpenFile, not what it
// resolves to. So this should never happen.
if sync.File.Name() != m.Source {
return fmt.Errorf("returned mountfd for %q doesn't match requested mount configuration: mountfd path is %q", m.Source, sync.File.Name())
}
// Unmarshal the procMountFd argument (the file is sync.File).
var src *mountSource
if sync.Arg == nil {
return fmt.Errorf("sync %q is missing an argument", sync.Type)
}
if err := json.Unmarshal(*sync.Arg, &src); err != nil {
return fmt.Errorf("invalid mount fd response argument %q: %w", string(*sync.Arg), err)
}
if src == nil {
return fmt.Errorf("mountfd request for %q: no mount source info received", m.Source)
}
src.file = sync.File
entry.srcFile = src
}
if err := mountToRootfs(mountConfig, entry); err != nil {
return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err)
}
}
setupDev := needsSetupDev(config)
if setupDev {
if err := createDevices(config); err != nil {
return fmt.Errorf("error creating device nodes: %w", err)
}
if err := setupPtmx(config); err != nil {
return fmt.Errorf("error setting up ptmx: %w", err)
}
if err := setupDevSymlinks(config.Rootfs); err != nil {
return fmt.Errorf("error setting up /dev symlinks: %w", err)
}
}
// Signal the parent to run the pre-start hooks.
// The hooks are run after the mounts are set up, but before we switch to the new
// root, so that the old root is still available in the hooks for any mount
// manipulations.
// Note that iConfig.Cwd is not guaranteed to exist here.
if err := syncParentHooks(pipe); err != nil {
return err
}
// The reason these operations are done here rather than in finalizeRootfs
// is that the console-handling code gets quite sticky if we have to set
// up the console before doing the pivot_root(2). This is because the
// Console API has to also work with the ExecIn case, which means that the
// API must be able to deal with being inside as well as outside the
// container. It's just cleaner to do this here (at the expense of the
// operation not being perfectly split).
if err := unix.Chdir(config.Rootfs); err != nil {
return &os.PathError{Op: "chdir", Path: config.Rootfs, Err: err}
}
if s := iConfig.SpecState; s != nil {
s.Pid = unix.Getpid()
s.Status = specs.StateCreating
if err := iConfig.Config.Hooks.Run(configs.CreateContainer, s); err != nil {
return err
}
}
if config.NoPivotRoot {
err = msMoveRoot(config.Rootfs)
} else if config.Namespaces.Contains(configs.NEWNS) {
err = pivotRoot(config.Rootfs)
} else {
err = chroot()
}
if err != nil {
return fmt.Errorf("error jailing process inside rootfs: %w", err)
}
if setupDev {
if err := reOpenDevNull(); err != nil {
return fmt.Errorf("error reopening /dev/null inside container: %w", err)
}
}
if cwd := iConfig.Cwd; cwd != "" {
// Note that spec.Process.Cwd can contain an unclean value like "../../../../foo/bar...".
// However, it is safe to call MkdirAll directly because we are in the jail here.
if err := os.MkdirAll(cwd, 0o755); err != nil {
return err
}
}
return nil
}
// finalizeRootfs remounts anything read-only if necessary. You must call
// prepareRootfs first.
func finalizeRootfs(config *configs.Config) (err error) {
// All tmpfs mounts and /dev were previously mounted as rw
// by mountPropagate. Remount them read-only as requested.
for _, m := range config.Mounts {
if m.Flags&unix.MS_RDONLY != unix.MS_RDONLY {
continue
}
if m.Device == "tmpfs" || utils.CleanPath(m.Destination) == "/dev" {
if err := remountReadonly(m); err != nil {
return err
}
}
}
// set rootfs ( / ) as readonly
if config.Readonlyfs {
if err := setReadonly(); err != nil {
return fmt.Errorf("error setting rootfs as readonly: %w", err)
}
}
if config.Umask != nil {
unix.Umask(int(*config.Umask))
} else {
unix.Umask(0o022)
}
return nil
}
// prepareTmp creates a private bind mount of a temporary directory under
// topTmpDir; /tmp has to be mounted as private to allow MS_MOVE to work in
// all situations.
func prepareTmp(topTmpDir string) (string, error) {
tmpdir, err := os.MkdirTemp(topTmpDir, "runctop")
if err != nil {
return "", err
}
if err := mount(tmpdir, tmpdir, "bind", unix.MS_BIND, ""); err != nil {
return "", err
}
if err := mount("", tmpdir, "", uintptr(unix.MS_PRIVATE), ""); err != nil {
return "", err
}
return tmpdir, nil
}
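// cleanupTmp unmounts and removes a temporary directory created by prepareTmp.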
func cleanupTmp(tmpdir string) {
_ = unix.Unmount(tmpdir, 0)
_ = os.RemoveAll(tmpdir)
}
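// mountCgroupV1 mounts a tmpfs at m.Destination and then either mounts fresh
// per-subsystem cgroup filesystems underneath it (when a cgroup namespace is
// used) or bind-mounts the host's cgroup v1 hierarchies, adding symlinks for
// merged subsystems such as cpu,cpuacct.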
func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
binds, err := getCgroupMounts(m)
if err != nil {
return err
}
var merged []string
for _, b := range binds {
ss := filepath.Base(b.Destination)
if strings.Contains(ss, ",") {
merged = append(merged, ss)
}
}
tmpfs := &configs.Mount{
Source: "tmpfs",
Device: "tmpfs",
Destination: m.Destination,
Flags: defaultMountFlags,
Data: "mode=755",
PropagationFlags: m.PropagationFlags,
}
if err := mountToRootfs(c, mountEntry{Mount: tmpfs}); err != nil {
return err
}
for _, b := range binds {
if c.cgroupns {
// We just created the tmpfs, and so we can just use filepath.Join
// here (not to mention we want to make sure we create the path
// inside the tmpfs, so we don't want to resolve symlinks).
subsystemPath := filepath.Join(c.root, b.Destination)
subsystemName := filepath.Base(b.Destination)
if err := utils.MkdirAllInRoot(c.root, subsystemPath, 0o755); err != nil {
return err
}
if err := utils.WithProcfd(c.root, b.Destination, func(dstFd string) error {
flags := defaultMountFlags
if m.Flags&unix.MS_RDONLY != 0 {
flags = flags | unix.MS_RDONLY
}
var (
source = "cgroup"
data = subsystemName
)
if data == "systemd" {
data = cgroups.CgroupNamePrefix + data
source = "systemd"
}
return mountViaFds(source, nil, b.Destination, dstFd, "cgroup", uintptr(flags), data)
}); err != nil {
return err
}
} else {
if err := mountToRootfs(c, mountEntry{Mount: b}); err != nil {
return err
}
}
}
for _, mc := range merged {
for _, ss := range strings.Split(mc, ",") {
// symlink(2) is very dumb: it will just shove the path into
// the link and doesn't do any checks or relative path
// conversion. Also, don't error out if the symlink already exists.
if err := os.Symlink(mc, filepath.Join(c.root, m.Destination, ss)); err != nil && !os.IsExist(err) {
return err
}
}
}
return nil
}
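// mountCgroupV2 mounts cgroup2 at m.Destination; if that is not permitted
// (e.g. in a userns without an unshared cgroupns), it falls back to
// bind-mounting the host's cgroup2 mountpoint or the container's own cgroup
// path.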
func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
err := utils.WithProcfd(c.root, m.Destination, func(dstFd string) error {
return mountViaFds(m.Source, nil, m.Destination, dstFd, "cgroup2", uintptr(m.Flags), m.Data)
})
if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) {
return err
}
// When we are in UserNS but CgroupNS is not unshared, we cannot mount
// cgroup2 (#2158), so fall back to bind mount.
bindM := &configs.Mount{
Device: "bind",
Source: fs2.UnifiedMountpoint,
Destination: m.Destination,
Flags: unix.MS_BIND | m.Flags,
PropagationFlags: m.PropagationFlags,
}
if c.cgroupns && c.cgroup2Path != "" {
// Emulate cgroupns by bind-mounting the container cgroup path
// rather than the whole /sys/fs/cgroup.
bindM.Source = c.cgroup2Path
}
// mountToRootfs() handles remounting for MS_RDONLY.
err = mountToRootfs(c, mountEntry{Mount: bindM})
if c.rootlessCgroups && errors.Is(err, unix.ENOENT) {
// ENOENT (for `src = c.cgroup2Path`) happens when rootless runc is being executed
// outside the userns+mountns.
//
// Mask `/sys/fs/cgroup` to ensure it is read-only, even when `/sys` is mounted
// with `rbind,ro` (`runc spec --rootless` produces `rbind,ro` for `/sys`).
err = utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
return maskPath(procfd, c.label)
})
}
return err
}
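// doTmpfsCopyUp implements the tmpcopyup extension: the tmpfs is first mounted
// on a scratch directory on the host, the existing contents of the destination
// are copied into it, and the mount is then moved into the container.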
func doTmpfsCopyUp(m mountEntry, rootfs, mountLabel string) (Err error) {
// Set up a scratch dir for the tmpfs on the host.
tmpdir, err := prepareTmp("/tmp")
if err != nil {
return fmt.Errorf("tmpcopyup: failed to setup tmpdir: %w", err)
}
defer cleanupTmp(tmpdir)
tmpDir, err := os.MkdirTemp(tmpdir, "runctmpdir")
if err != nil {
return fmt.Errorf("tmpcopyup: failed to create tmpdir: %w", err)
}
defer os.RemoveAll(tmpDir)
// Configure the *host* tmpdir as if it's the container mount. We change
// m.Destination since we are going to mount *on the host*.
oldDest := m.Destination
m.Destination = tmpDir
err = mountPropagate(m, "/", mountLabel)
m.Destination = oldDest
if err != nil {
return err
}
defer func() {
if Err != nil {
if err := unmount(tmpDir, unix.MNT_DETACH); err != nil {
logrus.Warnf("tmpcopyup: %v", err)
}
}
}()
return utils.WithProcfd(rootfs, m.Destination, func(dstFd string) (Err error) {
// Copy the container data to the host tmpdir. We append "/" to force
// CopyDirectory to resolve the symlink rather than trying to copy the
// symlink itself.
if err := fileutils.CopyDirectory(dstFd+"/", tmpDir); err != nil {
return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFd, tmpDir, err)
}
// Now move the mount into the container.
if err := mountViaFds(tmpDir, nil, m.Destination, dstFd, "", unix.MS_MOVE, ""); err != nil {
return fmt.Errorf("tmpcopyup: failed to move mount: %w", err)
}
return nil
})
}
const (
// The atime "enum" flags (which are mutually exclusive).
mntAtimeEnumFlags = unix.MS_NOATIME | unix.MS_RELATIME | unix.MS_STRICTATIME
// All atime-related flags.
mntAtimeFlags = mntAtimeEnumFlags | unix.MS_NODIRATIME
// Flags which can be locked when inheriting mounts in a different userns.
// In the kernel, these are the mounts that are locked using MNT_LOCK_*.
mntLockFlags = unix.MS_RDONLY | unix.MS_NODEV | unix.MS_NOEXEC |
unix.MS_NOSUID | mntAtimeFlags
)
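// statfsToMountFlags converts the ST_* flags reported by statfs(2) into the
// equivalent MS_* mount flags, treating the absence of any atime flag as
// MS_STRICTATIME.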
func statfsToMountFlags(st unix.Statfs_t) int {
// From <linux/statfs.h>.
const ST_NOSYMFOLLOW = 0x2000 //nolint:revive
var flags int
for _, f := range []struct {
st, ms int
}{
// See calculate_f_flags() in fs/statfs.c.
{unix.ST_RDONLY, unix.MS_RDONLY},
{unix.ST_NOSUID, unix.MS_NOSUID},
{unix.ST_NODEV, unix.MS_NODEV},
{unix.ST_NOEXEC, unix.MS_NOEXEC},
{unix.ST_MANDLOCK, unix.MS_MANDLOCK},
{unix.ST_SYNCHRONOUS, unix.MS_SYNCHRONOUS},
{unix.ST_NOATIME, unix.MS_NOATIME},
{unix.ST_NODIRATIME, unix.MS_NODIRATIME},
{unix.ST_RELATIME, unix.MS_RELATIME},
{ST_NOSYMFOLLOW, unix.MS_NOSYMFOLLOW},
// There is no ST_STRICTATIME -- see below.
} {
if int(st.Flags)&f.st == f.st {
flags |= f.ms
}
}
// MS_STRICTATIME is a "fake" MS_* flag. It isn't stored in mnt->mnt_flags,
// and so it doesn't show up in statfs(2). If none of the other flags in
// atime enum are present, the mount is MS_STRICTATIME.
if flags&mntAtimeEnumFlags == 0 {
flags |= unix.MS_STRICTATIME
}
return flags
}
var errRootfsToFile = errors.New("config tries to change rootfs to file")
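// createMountpoint creates the target of m inside rootfs (a regular file for
// file bind-mounts, a directory otherwise), after checking that the mount is
// proc-safe, and returns the resolved destination path.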
func createMountpoint(rootfs string, m mountEntry) (string, error) {
dest, err := securejoin.SecureJoin(rootfs, m.Destination)
if err != nil {
return "", err
}
if err := checkProcMount(rootfs, dest, m); err != nil {
return "", fmt.Errorf("check proc-safety of %s mount: %w", m.Destination, err)
}
switch m.Device {
case "bind":
fi, _, err := m.srcStat()
if err != nil {
// Error out if the source of a bind mount does not exist as we
// will be unable to bind anything to it.
return "", err
}
// If the original source is not a directory, make the target a file.
if !fi.IsDir() {
// Make sure we aren't tricked into trying to make the root a file.
if rootfs == dest {
return "", fmt.Errorf("%w: file bind mount over rootfs", errRootfsToFile)
}
// Make the parent directory.
destDir, destBase := filepath.Split(dest)
destDirFd, err := utils.MkdirAllInRootOpen(rootfs, destDir, 0o755)
if err != nil {
return "", fmt.Errorf("make parent dir of file bind-mount: %w", err)
}
defer destDirFd.Close()
// Make the target file. We want to avoid opening any file that is
// already there because it could be a "bad" file like an invalid
// device or hung tty that might cause a DoS, so we use mknodat.
// destBase does not contain any "/" components, and mknodat does
// not follow trailing symlinks, so we can safely just call mknodat
// here.
if err := unix.Mknodat(int(destDirFd.Fd()), destBase, unix.S_IFREG|0o644, 0); err != nil {
// If we get EEXIST, there was already an inode there and
// we can consider that a success.
if !errors.Is(err, unix.EEXIST) {
err = &os.PathError{Op: "mknod regular file", Path: dest, Err: err}
return "", fmt.Errorf("create target of file bind-mount: %w", err)
}
}
// Nothing left to do.
return dest, nil
}
case "tmpfs":
// If the original target exists, copy the mode for the tmpfs mount.
if stat, err := os.Stat(dest); err == nil {
dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode()))
if m.Data != "" {
dt = dt + "," + m.Data
}
m.Data = dt
// Nothing left to do.
return dest, nil
}
}
if err := utils.MkdirAllInRoot(rootfs, dest, 0o755); err != nil {
return "", err
}
return dest, nil
}
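// mountToRootfs mounts m into the container rootfs described by c, handling
// the special cases for proc, sysfs, mqueue, tmpfs, bind, and cgroup mounts.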
func mountToRootfs(c *mountConfig, m mountEntry) error {
rootfs := c.root
// procfs and sysfs are special because we need to ensure they are actually
// mounted on a specific path in a container without any funny business.
switch m.Device {
case "proc", "sysfs":
// If the destination already exists and is not a directory, we bail
// out. This is to avoid mounting through a symlink or similar -- which
// has been a "fun" attack scenario in the past.
// TODO: This won't be necessary once we switch to libpathrs and we can
// stop all of these symlink-exchange attacks.
dest := filepath.Clean(m.Destination)
if !utils.IsLexicallyInRoot(rootfs, dest) {
// Do not use securejoin as it resolves symlinks.
dest = filepath.Join(rootfs, dest)
}
if err := checkProcMount(rootfs, dest, m); err != nil {
return err
}
if fi, err := os.Lstat(dest); err != nil {
if !os.IsNotExist(err) {
return err
}
} else if !fi.IsDir() {
return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device)
}
if err := utils.MkdirAllInRoot(rootfs, dest, 0o755); err != nil {
return err
}
// SELinux kernels do not support labeling of /proc or /sys.
return mountPropagate(m, rootfs, "")
}
dest, err := createMountpoint(rootfs, m)
if err != nil {
return fmt.Errorf("create mountpoint for %s mount: %w", m.Destination, err)
}
mountLabel := c.label
switch m.Device {
case "mqueue":
if err := mountPropagate(m, rootfs, ""); err != nil {
return err
}
return label.SetFileLabel(dest, mountLabel)
case "tmpfs":
if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP {
err = doTmpfsCopyUp(m, rootfs, mountLabel)
} else {
err = mountPropagate(m, rootfs, mountLabel)
}
return err
case "bind":
// open_tree()-related shenanigans are all handled in mountViaFds.
if err := mountPropagate(m, rootfs, mountLabel); err != nil {
return err
}
// The initial MS_BIND won't change the mount options; we need to do a
// separate MS_BIND|MS_REMOUNT to apply the mount options. We skip
// doing this if the user has not specified any mount flags at all
// (including cleared flags) -- in which case we just keep the original
// mount flags.
//
// Note that the fact we check whether any clearing flags are set is in
// contrast to mount(8)'s current behaviour, but is what users probably
// expect. See <https://github.com/util-linux/util-linux/issues/2433>.
if m.Flags & ^(unix.MS_BIND|unix.MS_REC|unix.MS_REMOUNT) != 0 || m.ClearedFlags != 0 {
if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
flags := m.Flags | unix.MS_BIND | unix.MS_REMOUNT
// The runtime-spec says we SHOULD map to the relevant mount(8)
// behaviour. However, it's not clear whether we want the
// "mount --bind -o ..." or "mount --bind -o remount,..."
// behaviour here -- both of which are somewhat broken[1].
//
// So, if the user has passed "remount" as a mount option, we
// implement the "mount --bind -o remount" behaviour, otherwise
// we implement the spiritual intent of the "mount --bind -o"
// behaviour, which should match what users expect. Maybe
// mount(8) will eventually implement this behaviour too...
//
// [1]: https://github.com/util-linux/util-linux/issues/2433
// Initially, we emulate "mount --bind -o ..." where we set
// only the requested flags (clearing any existing flags). The
// only difference from mount(8) is that we do this
// unconditionally, regardless of whether any set-me mount
// options have been requested.
//
// TODO: We are not doing any special handling of the atime
// flags here, which means that the mount will inherit the old
// atime flags if the user didn't explicitly request a
// different set of flags. This also has the mount(8) bug where
// "nodiratime,norelatime" will result in a
// "nodiratime,relatime" mount.
mountErr := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(flags), "")
if mountErr == nil {
return nil
}
// If the mount failed, the mount may contain locked mount
// flags. In that case, we emulate "mount --bind -o
// remount,...", where we take the existing mount flags of the
// mount and apply the requested flags (including clearing flags)
// on top. The main divergence we have from mount(8) here is
// that we handle atimes correctly to make sure we error out if
// we cannot fulfil the requested mount flags.
st, err := m.srcStatfs()
if err != nil {
return err
}
srcFlags := statfsToMountFlags(*st)
// If the user explicitly requested that one of the locked flags *not*
// be set, we need to return an error to avoid producing mounts
// that don't match the user's request.
if srcFlags&m.ClearedFlags&mntLockFlags != 0 {
return mountErr
}
// If an MS_*ATIME flag was requested, it must match the
// existing one. This handles two separate kernel bugs, and
// matches the logic of can_change_locked_flags() but without
// these bugs:
//
// * (2.6.30+) Since commit 613cbe3d4870 ("Don't set relatime
// when noatime is specified"), MS_RELATIME is ignored when
// MS_NOATIME is set. This means that us inheriting MS_NOATIME
// from a mount while requesting MS_RELATIME would *silently*
// produce an MS_NOATIME mount.
//
// * (2.6.30+) Since its introduction in commit d0adde574b84
// ("Add a strictatime mount option"), MS_STRICTATIME has
// caused any passed MS_RELATIME and MS_NOATIME flags to be
// ignored which results in us *silently* producing
// MS_STRICTATIME mounts even if the user requested MS_RELATIME
// or MS_NOATIME.
if m.Flags&mntAtimeFlags != 0 && m.Flags&mntAtimeFlags != srcFlags&mntAtimeFlags {
return mountErr
}
// Retry the mount with the existing lockable mount flags
// applied.
flags |= srcFlags & mntLockFlags
mountErr = mountViaFds("", nil, m.Destination, dstFd, "", uintptr(flags), "")
logrus.Debugf("remount retry: srcFlags=0x%x flagsSet=0x%x flagsClr=0x%x: %v", srcFlags, m.Flags, m.ClearedFlags, mountErr)
return mountErr
}); err != nil {
return err
}
}
if m.Relabel != "" {
if err := label.Validate(m.Relabel); err != nil {
return err
}
shared := label.IsShared(m.Relabel)
if err := label.Relabel(m.Source, mountLabel, shared); err != nil {
return err
}
}
return setRecAttr(m.Mount, rootfs)
case "cgroup":
if cgroups.IsCgroup2UnifiedMode() {
return mountCgroupV2(m.Mount, c)
}
return mountCgroupV1(m.Mount, c)
default:
return mountPropagate(m, rootfs, mountLabel)
}
}
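// getCgroupMounts returns the bind-mount entries needed to recreate the
// cgroup v1 hierarchies of the current process under m.Destination.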
func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
mounts, err := cgroups.GetCgroupMounts(false)
if err != nil {
return nil, err
}
// We don't need to use /proc/thread-self here because runc always runs
// with every thread in the same cgroup. This lets us avoid having to do
// runtime.LockOSThread.
cgroupPaths, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return nil, err
}
var binds []*configs.Mount
for _, mm := range mounts {
dir, err := mm.GetOwnCgroup(cgroupPaths)
if err != nil {
return nil, err
}
relDir, err := filepath.Rel(mm.Root, dir)
if err != nil {
return nil, err
}
binds = append(binds, &configs.Mount{
Device: "bind",
Source: filepath.Join(mm.Mountpoint, relDir),
Destination: filepath.Join(m.Destination, filepath.Base(mm.Mountpoint)),
Flags: unix.MS_BIND | unix.MS_REC | m.Flags,
PropagationFlags: m.PropagationFlags,
})
}
return binds, nil
}
// Taken from <include/linux/proc_ns.h>. If a file is on a filesystem of type
// PROC_SUPER_MAGIC, we're guaranteed that only the root of the superblock will
// have this inode number.
const procRootIno = 1
// checkProcMount checks to ensure that the mount destination is not over the top of /proc.
// dest is required to be an abs path and have any symlinks resolved before calling this function.
//
// If m is nil, don't stat the filesystem. This is used for restore of a checkpoint.
func checkProcMount(rootfs, dest string, m mountEntry) error {
const procPath = "/proc"
path, err := filepath.Rel(filepath.Join(rootfs, procPath), dest)
if err != nil {
return err
}
// pass if the mount path is located outside of /proc
if strings.HasPrefix(path, "..") {
return nil
}
if path == "." {
// Only allow bind-mounts on top of /proc, and only if the source is a
// procfs mount.
if m.IsBind() {
fsSt, err := m.srcStatfs()
if err != nil {
return err
}
if fsSt.Type == unix.PROC_SUPER_MAGIC {
if _, uSt, err := m.srcStat(); err != nil {
return err
} else if uSt.Ino != procRootIno {
// We cannot error out in this case, because we've
// supported these kinds of mounts for a long time.
// However, we would expect users to bind-mount the root of
// a real procfs on top of /proc in the container. We might
// want to block this in the future.
logrus.Warnf("bind-mount %v (source %v) is of type procfs but is not the root of a procfs (inode %d). Future versions of runc might block this configuration -- please report an issue to <https://github.com/opencontainers/runc> if you see this warning.", dest, m.srcName(), uSt.Ino)
}
return nil
}
} else if m.Device == "proc" {
// Fresh procfs-type mounts are always safe to mount on top of /proc.
return nil
}
return fmt.Errorf("%q cannot be mounted because it is not of type proc", dest)
}
// Here dest is definitely under /proc. Do not allow such mounts,
// except for a few specific entries emulated by lxcfs.
validProcMounts := []string{
"/proc/cpuinfo",
"/proc/diskstats",
"/proc/meminfo",
"/proc/stat",
"/proc/swaps",
"/proc/uptime",
"/proc/loadavg",
"/proc/slabinfo",
"/proc/net/dev",
"/proc/sys/kernel/ns_last_pid",
"/proc/sys/crypto/fips_enabled",
}
for _, valid := range validProcMounts {
path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
if err != nil {
return err
}
if path == "." {
return nil
}
}
return fmt.Errorf("%q cannot be mounted because it is inside /proc", dest)
}
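// setupDevSymlinks creates the standard /dev symlinks (/dev/fd, /dev/stdin,
// /dev/stdout, /dev/stderr, and /dev/core if /proc/kcore exists) inside rootfs.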
func setupDevSymlinks(rootfs string) error {
// In theory, these should be links to /proc/thread-self, but systems
// expect these to be /proc/self and this matches how most distributions
// work.
links := [][2]string{
{"/proc/self/fd", "/dev/fd"},
{"/proc/self/fd/0", "/dev/stdin"},
{"/proc/self/fd/1", "/dev/stdout"},
{"/proc/self/fd/2", "/dev/stderr"},
}
// kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink
// in /dev if it exists in /proc.
if _, err := os.Stat("/proc/kcore"); err == nil {
links = append(links, [2]string{"/proc/kcore", "/dev/core"})
}
for _, link := range links {
var (
src = link[0]
dst = filepath.Join(rootfs, link[1])
)
if err := os.Symlink(src, dst); err != nil && !os.IsExist(err) {
return err
}
}
return nil
}
// If stdin, stdout, and/or stderr are pointing to `/dev/null` in the parent's rootfs,
// this method will make them point to `/dev/null` in this container's rootfs. This
// needs to be called after we chroot/pivot into the container's rootfs so that any
// symlinks are resolved locally.
func reOpenDevNull() error {
var stat, devNullStat unix.Stat_t
file, err := os.OpenFile("/dev/null", os.O_RDWR, 0)
if err != nil {
return err
}
defer file.Close() //nolint: errcheck
if err := unix.Fstat(int(file.Fd()), &devNullStat); err != nil {
return &os.PathError{Op: "fstat", Path: file.Name(), Err: err}
}
for fd := 0; fd < 3; fd++ {
if err := unix.Fstat(fd, &stat); err != nil {
return &os.PathError{Op: "fstat", Path: "fd " + strconv.Itoa(fd), Err: err}
}
if stat.Rdev == devNullStat.Rdev {
// Close and re-open the fd.
if err := unix.Dup3(int(file.Fd()), fd, 0); err != nil {
return &os.PathError{
Op: "dup3",
Path: "fd " + strconv.Itoa(int(file.Fd())),
Err: err,
}
}
}
}
return nil
}
// Create the device nodes in the container.
func createDevices(config *configs.Config) error {
useBindMount := userns.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
for _, node := range config.Devices {
// The /dev/ptmx device is set up by setupPtmx().
if utils.CleanPath(node.Path) == "/dev/ptmx" {
continue
}
// Containers running in a user namespace are not allowed to mknod
// devices, so we bind-mount them from the host instead.
if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil {
return err
}
}
return nil
}
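// bindMountDeviceNode creates an empty file at dest (if one doesn't already
// exist) and bind-mounts the host device node on top of it.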
func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error {
f, err := os.Create(dest)
if err != nil && !os.IsExist(err) {
return err
}
if f != nil {
_ = f.Close()
}
return utils.WithProcfd(rootfs, dest, func(dstFd string) error {
return mountViaFds(node.Path, nil, dest, dstFd, "bind", unix.MS_BIND, "")
})
}
// Creates the device node in the rootfs of the container.
func createDeviceNode(rootfs string, node *devices.Device, bind bool) error {
if node.Path == "" {
// The node only exists for cgroup reasons, ignore it here.
return nil
}
dest, err := securejoin.SecureJoin(rootfs, node.Path)
if err != nil {
return err
}
if dest == rootfs {
return fmt.Errorf("%w: mknod over rootfs", errRootfsToFile)
}
if err := utils.MkdirAllInRoot(rootfs, filepath.Dir(dest), 0o755); err != nil {
return err
}
if bind {
return bindMountDeviceNode(rootfs, dest, node)
}
if err := mknodDevice(dest, node); err != nil {
if errors.Is(err, os.ErrExist) {
return nil
} else if errors.Is(err, os.ErrPermission) {
return bindMountDeviceNode(rootfs, dest, node)
}
return err
}
return nil
}
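// mknodDevice creates the device node at dest with mknod(2), then fixes up its
// permission bits and ownership.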
func mknodDevice(dest string, node *devices.Device) error {
fileMode := node.FileMode
switch node.Type {
case devices.BlockDevice:
fileMode |= unix.S_IFBLK
case devices.CharDevice:
fileMode |= unix.S_IFCHR
case devices.FifoDevice:
fileMode |= unix.S_IFIFO
default:
return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path)
}
dev, err := node.Mkdev()
if err != nil {
return err
}
if err := unix.Mknod(dest, uint32(fileMode), int(dev)); err != nil {
return &os.PathError{Op: "mknod", Path: dest, Err: err}
}
// Ensure permission bits (can be different because of umask).
if err := os.Chmod(dest, fileMode); err != nil {
return err
}
return os.Chown(dest, int(node.Uid), int(node.Gid))
}
// rootfsParentMountPrivate ensures rootfs parent mount is private.
// This is needed for two reasons:
// - pivot_root() will fail if parent mount is shared;
// - when we bind mount rootfs, if its parent is not private, the new mount
// will propagate (leak!) to parent namespace and we don't want that.
func rootfsParentMountPrivate(path string) error {
var err error
// Assuming path is absolute and clean (this is checked in
// libcontainer/validate). Any error other than EINVAL means we failed,
// and EINVAL means this is not a mount point, so traverse up until we
// find one.
for {
err = unix.Mount("", path, "", unix.MS_PRIVATE, "")
if err == nil {
return nil
}
if err != unix.EINVAL || path == "/" { //nolint:errorlint // unix errors are bare
break
}
path = filepath.Dir(path)
}
return &mountError{
op: "remount-private",
target: path,
flags: unix.MS_PRIVATE,
err: err,
}
}
func prepareRoot(config *configs.Config) error {
flag := unix.MS_SLAVE | unix.MS_REC
if config.RootPropagation != 0 {
flag = config.RootPropagation
}
if err := mount("", "/", "", uintptr(flag), ""); err != nil {
return err
}
if err := rootfsParentMountPrivate(config.Rootfs); err != nil {
return err
}
return mount(config.Rootfs, config.Rootfs, "bind", unix.MS_BIND|unix.MS_REC, "")
}
func setReadonly() error {
flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY)
err := mount("", "/", "", flags, "")
if err == nil {
return nil
}
var s unix.Statfs_t
if err := unix.Statfs("/", &s); err != nil {
return &os.PathError{Op: "statfs", Path: "/", Err: err}
}
flags |= uintptr(s.Flags)
return mount("", "/", "", flags, "")
}
func setupPtmx(config *configs.Config) error {
ptmx := filepath.Join(config.Rootfs, "dev/ptmx")
if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
return err
}
if err := os.Symlink("pts/ptmx", ptmx); err != nil {
return err
}
return nil
}
// pivotRoot will call pivot_root such that rootfs becomes the new root
// filesystem, and everything else is cleaned up.
func pivotRoot(rootfs string) error {
// While the documentation may claim otherwise, pivot_root(".", ".") is
// actually valid. What this results in is / being the new root but
// /proc/self/cwd being the old root. Since we can play around with the cwd
// with pivot_root this allows us to pivot without creating directories in
// the rootfs. Shout-outs to the LXC developers for giving us this idea.
oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return &os.PathError{Op: "open", Path: "/", Err: err}
}
defer unix.Close(oldroot) //nolint: errcheck
newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return &os.PathError{Op: "open", Path: rootfs, Err: err}
}
defer unix.Close(newroot) //nolint: errcheck
// Change to the new root so that the pivot_root actually acts on it.
if err := unix.Fchdir(newroot); err != nil {
return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(newroot), Err: err}
}
if err := unix.PivotRoot(".", "."); err != nil {
return &os.PathError{Op: "pivot_root", Path: ".", Err: err}
}
// Currently our "." is oldroot (according to the current kernel code).
// However, purely for safety, we will fchdir(oldroot) since there isn't
// really any guarantee from the kernel what /proc/self/cwd will be after a
// pivot_root(2).
if err := unix.Fchdir(oldroot); err != nil {
return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(oldroot), Err: err}
}
// Make oldroot rslave to make sure our unmounts don't propagate to the
// host (and thus bork the machine). We don't use rprivate because this is
// known to cause issues due to races where we still have a reference to a
// mount while a process in the host namespace is trying to operate on
// something it thinks has no mounts (devicemapper in particular).
if err := mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
return err
}
// Perform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
if err := unmount(".", unix.MNT_DETACH); err != nil {
return err
}
// Switch back to our shiny new root.
if err := unix.Chdir("/"); err != nil {
return &os.PathError{Op: "chdir", Path: "/", Err: err}
}
return nil
}
func msMoveRoot(rootfs string) error {
// Before we move the root and chroot we have to mask all "full" sysfs and
// procfs mounts which exist on the host. This is because while the kernel
// has protections against mounting procfs if it has masks, when using
// chroot(2) the *host* procfs mount is still reachable in the mount
// namespace and the kernel permits procfs mounts inside --no-pivot
// containers.
//
// Users shouldn't be using --no-pivot except in exceptional circumstances,
// but to avoid such a trivial security flaw we apply a best-effort
// protection here. The kernel only allows a mount of a pseudo-filesystem
// like procfs or sysfs if there is a *full* mount (the root of the
// filesystem is mounted) without any other locked mount points covering a
// subtree of the mount.
//
// So we try to unmount (or mount tmpfs on top of) any mountpoint which is
// a full mount of either sysfs or procfs (since those are the most
// concerning filesystems to us).
mountinfos, err := mountinfo.GetMounts(func(info *mountinfo.Info) (skip, stop bool) {
// Collect every sysfs and procfs filesystem, except for those which
// are non-full mounts or are inside the rootfs of the container.
if info.Root != "/" ||
(info.FSType != "proc" && info.FSType != "sysfs") ||
strings.HasPrefix(info.Mountpoint, rootfs) {
skip = true
}
return
})
if err != nil {
return err
}
for _, info := range mountinfos {
p := info.Mountpoint
// Be sure umount events are not propagated to the host.
if err := mount("", p, "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
if errors.Is(err, unix.ENOENT) {
// If the mountpoint doesn't exist that means that we've
// already blasted away some parent directory of the mountpoint
// and so we don't care about this error.
continue
}
return err
}
if err := unmount(p, unix.MNT_DETACH); err != nil {
if !errors.Is(err, unix.EINVAL) && !errors.Is(err, unix.EPERM) {
return err
} else {
// If we don't have permission to umount (e.g. rootless), cover the
// path with a tmpfs mount instead.
if err := mount("tmpfs", p, "tmpfs", 0, ""); err != nil {
return err
}
}
}
}
// Move the rootfs on top of "/" in our mount namespace.
if err := mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil {
return err
}
return chroot()
}
func chroot() error {
if err := unix.Chroot("."); err != nil {
return &os.PathError{Op: "chroot", Path: ".", Err: err}
}
if err := unix.Chdir("/"); err != nil {
return &os.PathError{Op: "chdir", Path: "/", Err: err}
}
return nil
}
// readonlyPath will make a path read only.
func readonlyPath(path string) error {
if err := mount(path, path, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
if errors.Is(err, os.ErrNotExist) {
return nil
}
return err
}
var s unix.Statfs_t
if err := unix.Statfs(path, &s); err != nil {
return &os.PathError{Op: "statfs", Path: path, Err: err}
}
flags := uintptr(s.Flags) & (unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC)
if err := mount(path, path, "", flags|unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY, ""); err != nil {
return err
}
return nil
}
// remountReadonly will remount an existing mount point and ensure that it is read-only.
func remountReadonly(m *configs.Mount) error {
var (
dest = m.Destination
flags = m.Flags
)
for i := 0; i < 5; i++ {
// There is a special case in the kernel for
// MS_REMOUNT | MS_BIND, which allows us to change only the
// flags even as an unprivileged user (i.e. user namespace)
// assuming we don't drop any security related flags (nodev,
// nosuid, etc.). So, let's use that case so that we can do
// this re-mount without failing in a userns.
flags |= unix.MS_REMOUNT | unix.MS_BIND | unix.MS_RDONLY
if err := mount("", dest, "", uintptr(flags), ""); err != nil {
if errors.Is(err, unix.EBUSY) {
time.Sleep(100 * time.Millisecond)
continue
}
return err
}
return nil
}
return fmt.Errorf("unable to mount %s as readonly max retries reached", dest)
}
// maskPath masks the top of the specified path inside a container to avoid
// security issues from processes reading information from non-namespace aware
// mounts (e.g. /proc/kcore).
// For files, maskPath bind mounts /dev/null over the top of the specified path.
// For directories, maskPath mounts read-only tmpfs over the top of the specified path.
func maskPath(path string, mountLabel string) error {
if err := mount("/dev/null", path, "", unix.MS_BIND, ""); err != nil && !errors.Is(err, os.ErrNotExist) {
if errors.Is(err, unix.ENOTDIR) {
return mount("tmpfs", path, "tmpfs", unix.MS_RDONLY, label.FormatMountLabel("", mountLabel))
}
return err
}
return nil
}
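// For instance (the paths mirror the default masked paths; mountLabel is
// assumed to be the caller's SELinux mount label):
//
//	_ = maskPath("/proc/kcore", mountLabel) // file: bind-mounts /dev/null on top
//	_ = maskPath("/proc/scsi", mountLabel)  // directory: falls back to read-only tmpfs (ENOTDIR)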
// writeSystemProperty writes the value to a path under /proc/sys as determined from the key.
// For example, net.ipv4.ip_forward is translated to /proc/sys/net/ipv4/ip_forward.
func writeSystemProperty(key, value string) error {
keyPath := strings.Replace(key, ".", "/", -1)
return os.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0o644)
}
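// A minimal usage sketch (the sysctl map and error wrapping here are
// illustrative, mirroring how config.Sysctl entries are shaped):
//
//	for key, value := range sysctls {
//		if err := writeSystemProperty(key, value); err != nil {
//			return fmt.Errorf("sysctl %q: %w", key, err)
//		}
//	}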
// Do the mount operation followed by additional mounts required to take care
// of propagation flags. This will always be scoped inside the container rootfs.
func mountPropagate(m mountEntry, rootfs string, mountLabel string) error {
var (
data = label.FormatMountLabel(m.Data, mountLabel)
flags = m.Flags
)
// Delay mounting the filesystem read-only if we need to do further
// operations on it. We need to set up files in "/dev", and other tmpfs
// mounts may need to be chmod-ed after mounting. These mounts will be
// remounted ro later in finalizeRootfs(), if necessary.
if m.Device == "tmpfs" || utils.CleanPath(m.Destination) == "/dev" {
flags &= ^unix.MS_RDONLY
}
// Because the destination is inside a container path which might be
// mutating underneath us, we verify that we are actually going to mount
// inside the container with WithProcfd() -- mounting through a procfd
// mounts on the target.
if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
return mountViaFds(m.Source, m.srcFile, m.Destination, dstFd, m.Device, uintptr(flags), data)
}); err != nil {
return err
}
// We have to apply mount propagation flags in a separate WithProcfd() call
// because the previous call invalidates the passed procfd -- the mount
// target needs to be re-opened.
if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
for _, pflag := range m.PropagationFlags {
if err := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(pflag), ""); err != nil {
return err
}
}
return nil
}); err != nil {
return fmt.Errorf("change mount propagation through procfd: %w", err)
}
return nil
}
func setRecAttr(m *configs.Mount, rootfs string) error {
if m.RecAttr == nil {
return nil
}
return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
return unix.MountSetattr(-1, procfd, unix.AT_RECURSIVE, m.RecAttr)
})
}
package seccomp
import (
"fmt"
"sort"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runtime-spec/specs-go"
)
// flagTsync is recognized but ignored by runc, and it is not defined
// in the runtime-spec.
const flagTsync = "SECCOMP_FILTER_FLAG_TSYNC"
var operators = map[string]configs.Operator{
"SCMP_CMP_NE": configs.NotEqualTo,
"SCMP_CMP_LT": configs.LessThan,
"SCMP_CMP_LE": configs.LessThanOrEqualTo,
"SCMP_CMP_EQ": configs.EqualTo,
"SCMP_CMP_GE": configs.GreaterThanOrEqualTo,
"SCMP_CMP_GT": configs.GreaterThan,
"SCMP_CMP_MASKED_EQ": configs.MaskEqualTo,
}
// KnownOperators returns the list of the known operations.
// Used by `runc features`.
func KnownOperators() []string {
var res []string
for k := range operators {
res = append(res, k)
}
sort.Strings(res)
return res
}
var actions = map[string]configs.Action{
"SCMP_ACT_KILL": configs.Kill,
"SCMP_ACT_ERRNO": configs.Errno,
"SCMP_ACT_TRAP": configs.Trap,
"SCMP_ACT_ALLOW": configs.Allow,
"SCMP_ACT_TRACE": configs.Trace,
"SCMP_ACT_LOG": configs.Log,
"SCMP_ACT_NOTIFY": configs.Notify,
"SCMP_ACT_KILL_THREAD": configs.KillThread,
"SCMP_ACT_KILL_PROCESS": configs.KillProcess,
}
// KnownActions returns the list of the known actions.
// Used by `runc features`.
func KnownActions() []string {
var res []string
for k := range actions {
res = append(res, k)
}
sort.Strings(res)
return res
}
var archs = map[string]string{
"SCMP_ARCH_X86": "x86",
"SCMP_ARCH_X86_64": "amd64",
"SCMP_ARCH_X32": "x32",
"SCMP_ARCH_ARM": "arm",
"SCMP_ARCH_AARCH64": "arm64",
"SCMP_ARCH_MIPS": "mips",
"SCMP_ARCH_MIPS64": "mips64",
"SCMP_ARCH_MIPS64N32": "mips64n32",
"SCMP_ARCH_MIPSEL": "mipsel",
"SCMP_ARCH_MIPSEL64": "mipsel64",
"SCMP_ARCH_MIPSEL64N32": "mipsel64n32",
"SCMP_ARCH_PPC": "ppc",
"SCMP_ARCH_PPC64": "ppc64",
"SCMP_ARCH_PPC64LE": "ppc64le",
"SCMP_ARCH_RISCV64": "riscv64",
"SCMP_ARCH_S390": "s390",
"SCMP_ARCH_S390X": "s390x",
}
// KnownArchs returns the list of the known archs.
// Used by `runc features`.
func KnownArchs() []string {
var res []string
for k := range archs {
res = append(res, k)
}
sort.Strings(res)
return res
}
// ConvertStringToOperator converts a string into a Seccomp comparison operator.
// Comparison operators use the names they are assigned by Libseccomp's header.
// Attempting to convert a string that is not a valid operator results in an
// error.
func ConvertStringToOperator(in string) (configs.Operator, error) {
if op, ok := operators[in]; ok {
return op, nil
}
return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in)
}
// ConvertStringToAction converts a string into a Seccomp rule match action.
// Actions use the names they are assigned in Libseccomp's header.
// Attempting to convert a string that is not a valid action results in an
// error.
func ConvertStringToAction(in string) (configs.Action, error) {
if act, ok := actions[in]; ok {
return act, nil
}
return 0, fmt.Errorf("string %s is not a valid action for seccomp", in)
}
// ConvertStringToArch converts a string into a Seccomp comparison arch.
func ConvertStringToArch(in string) (string, error) {
if arch, ok := archs[in]; ok {
return arch, nil
}
return "", fmt.Errorf("string %s is not a valid arch for seccomp", in)
}
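// A short sketch of how these conversion helpers fit together when translating
// a seccomp rule (the literal values are illustrative):
//
//	act, _ := ConvertStringToAction("SCMP_ACT_ERRNO")  // configs.Errno
//	op, _ := ConvertStringToOperator("SCMP_CMP_EQ")    // configs.EqualTo
//	arch, _ := ConvertStringToArch("SCMP_ARCH_X86_64") // "amd64"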
// List of flags known to this version of runc.
var flags = []string{
flagTsync,
string(specs.LinuxSeccompFlagSpecAllow),
string(specs.LinuxSeccompFlagLog),
}
// KnownFlags returns the list of the known filter flags.
// Used by `runc features`.
func KnownFlags() []string {
return flags
}
// SupportedFlags returns the list of the supported filter flags.
// This list may be a subset of one returned by KnownFlags due to
// some flags not supported by the current kernel and/or libseccomp.
// Used by `runc features`.
func SupportedFlags() []string {
if !Enabled {
return nil
}
var res []string
for _, flag := range flags {
if FlagSupported(specs.LinuxSeccompFlag(flag)) == nil {
res = append(res, flag)
}
}
return res
}
//go:build !linux || !cgo || !seccomp
package seccomp
import (
"errors"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runtime-spec/specs-go"
)
var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
// InitSeccomp does nothing because seccomp is not supported.
func InitSeccomp(config *configs.Seccomp) (int, error) {
if config != nil {
return -1, ErrSeccompNotEnabled
}
return -1, nil
}
// FlagSupported tells if a provided seccomp flag is supported.
func FlagSupported(_ specs.LinuxSeccompFlag) error {
return ErrSeccompNotEnabled
}
// Version returns major, minor, and micro.
func Version() (uint, uint, uint) {
return 0, 0, 0
}
// Enabled is true if seccomp support is compiled in.
const Enabled = false
package libcontainer
import (
"errors"
"fmt"
"os"
"os/exec"
"github.com/opencontainers/selinux/go-selinux"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
)
// linuxSetnsInit performs the container's initialization for running a new process
// inside an existing container.
type linuxSetnsInit struct {
pipe *syncSocket
consoleSocket *os.File
pidfdSocket *os.File
config *initConfig
logPipe *os.File
}
func (l *linuxSetnsInit) getSessionRingName() string {
return "_ses." + l.config.ContainerID
}
func (l *linuxSetnsInit) Init() error {
if !l.config.Config.NoNewKeyring {
if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
return err
}
defer selinux.SetKeyLabel("") //nolint: errcheck
// Do not inherit the parent's session keyring.
if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
// Same justification as in standard_init_linux.go as to why we
// don't bail on ENOSYS.
//
// TODO(cyphar): And we should have logging here too.
if !errors.Is(err, unix.ENOSYS) {
return fmt.Errorf("unable to join session keyring: %w", err)
}
}
}
if l.config.CreateConsole {
if err := setupConsole(l.consoleSocket, l.config, false); err != nil {
return err
}
if err := system.Setctty(); err != nil {
return err
}
}
if l.pidfdSocket != nil {
if err := setupPidfd(l.pidfdSocket, "setns"); err != nil {
return fmt.Errorf("failed to setup pidfd: %w", err)
}
}
if l.config.NoNewPrivileges {
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
}
}
if l.config.Config.Umask != nil {
unix.Umask(int(*l.config.Config.Umask))
}
if err := setupScheduler(l.config); err != nil {
return err
}
if err := setupIOPriority(l.config); err != nil {
return err
}
// Tell our parent that we're ready to exec. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return fmt.Errorf("sync ready: %w", err)
}
if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
return err
}
defer selinux.SetExecLabel("") //nolint: errcheck
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
if err != nil {
return err
}
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
}
if err := finalizeNamespace(l.config); err != nil {
return err
}
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
if l.config.Config.Personality != nil {
if err := setupPersonality(l.config.Config); err != nil {
return err
}
}
// Check for the arg early to make sure it exists.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles).
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
if err != nil {
return fmt.Errorf("unable to init seccomp: %w", err)
}
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
}
// Close the pipe to signal that we have completed our init.
// Please keep this because we don't want to get a pipe write error if
// there is an error from `execve` after all fds are closed.
_ = l.pipe.Close()
// Close the log pipe fd so the parent's ForwardLogs can exit.
logrus.Debugf("setns_init: about to exec")
if err := l.logPipe.Close(); err != nil {
return fmt.Errorf("close log pipe: %w", err)
}
// Close all file descriptors we are not passing to the container. This is
// necessary because the execve target could use internal runc fds as the
// execve path, potentially giving access to binary files from the host
// (which can then be opened by container processes, leading to container
// escapes). Note that because this operation will close any open file
// descriptors that are referenced by (*os.File) handles from underneath
// the Go runtime, we must not do any file operations after this point
// (otherwise the (*os.File) finaliser could close the wrong file). See
// CVE-2024-21626 for more information as to why this protection is
// necessary.
if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
return err
}
return system.Exec(name, l.config.Args, l.config.Env)
}
package specconv
import (
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runtime-spec/specs-go"
)
// Example returns an example spec file, with many options set so a user can
// see what a standard spec file looks like.
func Example() *specs.Spec {
spec := &specs.Spec{
Version: specs.Version,
Root: &specs.Root{
Path: "rootfs",
Readonly: true,
},
Process: &specs.Process{
Terminal: true,
User: specs.User{},
Args: []string{
"sh",
},
Env: []string{
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"TERM=xterm",
},
Cwd: "/",
NoNewPrivileges: true,
Capabilities: &specs.LinuxCapabilities{
Bounding: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
Permitted: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
Effective: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
},
Rlimits: []specs.POSIXRlimit{
{
Type: "RLIMIT_NOFILE",
Hard: uint64(1024),
Soft: uint64(1024),
},
},
},
Hostname: "runc",
Mounts: []specs.Mount{
{
Destination: "/proc",
Type: "proc",
Source: "proc",
Options: nil,
},
{
Destination: "/dev",
Type: "tmpfs",
Source: "tmpfs",
Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
},
{
Destination: "/dev/pts",
Type: "devpts",
Source: "devpts",
Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
},
{
Destination: "/dev/shm",
Type: "tmpfs",
Source: "shm",
Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"},
},
{
Destination: "/dev/mqueue",
Type: "mqueue",
Source: "mqueue",
Options: []string{"nosuid", "noexec", "nodev"},
},
{
Destination: "/sys",
Type: "sysfs",
Source: "sysfs",
Options: []string{"nosuid", "noexec", "nodev", "ro"},
},
{
Destination: "/sys/fs/cgroup",
Type: "cgroup",
Source: "cgroup",
Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
},
},
Linux: &specs.Linux{
MaskedPaths: []string{
"/proc/acpi",
"/proc/asound",
"/proc/kcore",
"/proc/keys",
"/proc/latency_stats",
"/proc/timer_list",
"/proc/timer_stats",
"/proc/sched_debug",
"/sys/firmware",
"/proc/scsi",
},
ReadonlyPaths: []string{
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger",
},
Resources: &specs.LinuxResources{
Devices: []specs.LinuxDeviceCgroup{
{
Allow: false,
Access: "rwm",
},
},
},
Namespaces: []specs.LinuxNamespace{
{
Type: specs.PIDNamespace,
},
{
Type: specs.NetworkNamespace,
},
{
Type: specs.IPCNamespace,
},
{
Type: specs.UTSNamespace,
},
{
Type: specs.MountNamespace,
},
},
},
}
if cgroups.IsCgroup2UnifiedMode() {
spec.Linux.Namespaces = append(spec.Linux.Namespaces, specs.LinuxNamespace{
Type: specs.CgroupNamespace,
})
}
return spec
}
// ToRootless converts the given spec file into one that should work with
// rootless containers (euid != 0), by removing incompatible options and adding others that
// are needed.
func ToRootless(spec *specs.Spec) {
var namespaces []specs.LinuxNamespace
// Remove networkns from the spec.
for _, ns := range spec.Linux.Namespaces {
switch ns.Type {
case specs.NetworkNamespace, specs.UserNamespace:
// Do nothing.
default:
namespaces = append(namespaces, ns)
}
}
// Add userns to the spec.
namespaces = append(namespaces, specs.LinuxNamespace{
Type: specs.UserNamespace,
})
spec.Linux.Namespaces = namespaces
// Add mappings for the current user.
spec.Linux.UIDMappings = []specs.LinuxIDMapping{{
HostID: uint32(os.Geteuid()),
ContainerID: 0,
Size: 1,
}}
spec.Linux.GIDMappings = []specs.LinuxIDMapping{{
HostID: uint32(os.Getegid()),
ContainerID: 0,
Size: 1,
}}
// Fix up mounts.
var mounts []specs.Mount
for _, mount := range spec.Mounts {
// Replace the /sys mount with an rbind.
if filepath.Clean(mount.Destination) == "/sys" {
mounts = append(mounts, specs.Mount{
Source: "/sys",
Destination: "/sys",
Type: "none",
Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
})
continue
}
// Remove all gid= and uid= mappings.
var options []string
for _, option := range mount.Options {
if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") {
options = append(options, option)
}
}
mount.Options = options
mounts = append(mounts, mount)
}
spec.Mounts = mounts
// Remove cgroup settings.
spec.Linux.Resources = nil
}
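// A minimal sketch of combining Example and ToRootless to produce a spec for a
// rootless container (the JSON marshalling step is illustrative and assumes
// encoding/json):
//
//	spec := Example()
//	ToRootless(spec)
//	data, _ := json.MarshalIndent(spec, "", "\t")
//	_ = os.WriteFile("config.json", data, 0o644)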
// Package specconv implements conversion of specifications to libcontainer
// configurations
package specconv
import (
"errors"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"sync"
"time"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
"github.com/opencontainers/runc/libcontainer/cgroups"
devices "github.com/opencontainers/runc/libcontainer/cgroups/devices/config"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/internal/userns"
"github.com/opencontainers/runc/libcontainer/seccomp"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
var (
initMapsOnce sync.Once
namespaceMapping map[specs.LinuxNamespaceType]configs.NamespaceType
mountPropagationMapping map[string]int
recAttrFlags map[string]struct {
clear bool
flag uint64
}
mountFlags, extensionFlags map[string]struct {
clear bool
flag int
}
complexFlags map[string]func(*configs.Mount)
)
func initMaps() {
initMapsOnce.Do(func() {
namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{
specs.PIDNamespace: configs.NEWPID,
specs.NetworkNamespace: configs.NEWNET,
specs.MountNamespace: configs.NEWNS,
specs.UserNamespace: configs.NEWUSER,
specs.IPCNamespace: configs.NEWIPC,
specs.UTSNamespace: configs.NEWUTS,
specs.CgroupNamespace: configs.NEWCGROUP,
specs.TimeNamespace: configs.NEWTIME,
}
mountPropagationMapping = map[string]int{
"rprivate": unix.MS_PRIVATE | unix.MS_REC,
"private": unix.MS_PRIVATE,
"rslave": unix.MS_SLAVE | unix.MS_REC,
"slave": unix.MS_SLAVE,
"rshared": unix.MS_SHARED | unix.MS_REC,
"shared": unix.MS_SHARED,
"runbindable": unix.MS_UNBINDABLE | unix.MS_REC,
"unbindable": unix.MS_UNBINDABLE,
}
mountFlags = map[string]struct {
clear bool
flag int
}{
// "acl" cannot be mapped to MS_POSIXACL: https://github.com/opencontainers/runc/issues/3738
"async": {true, unix.MS_SYNCHRONOUS},
"atime": {true, unix.MS_NOATIME},
"bind": {false, unix.MS_BIND},
"defaults": {false, 0},
"dev": {true, unix.MS_NODEV},
"diratime": {true, unix.MS_NODIRATIME},
"dirsync": {false, unix.MS_DIRSYNC},
"exec": {true, unix.MS_NOEXEC},
"iversion": {false, unix.MS_I_VERSION},
"lazytime": {false, unix.MS_LAZYTIME},
"loud": {true, unix.MS_SILENT},
"mand": {false, unix.MS_MANDLOCK},
"noatime": {false, unix.MS_NOATIME},
"nodev": {false, unix.MS_NODEV},
"nodiratime": {false, unix.MS_NODIRATIME},
"noexec": {false, unix.MS_NOEXEC},
"noiversion": {true, unix.MS_I_VERSION},
"nolazytime": {true, unix.MS_LAZYTIME},
"nomand": {true, unix.MS_MANDLOCK},
"norelatime": {true, unix.MS_RELATIME},
"nostrictatime": {true, unix.MS_STRICTATIME},
"nosuid": {false, unix.MS_NOSUID},
"nosymfollow": {false, unix.MS_NOSYMFOLLOW}, // since kernel 5.10
"rbind": {false, unix.MS_BIND | unix.MS_REC},
"relatime": {false, unix.MS_RELATIME},
"remount": {false, unix.MS_REMOUNT},
"ro": {false, unix.MS_RDONLY},
"rw": {true, unix.MS_RDONLY},
"silent": {false, unix.MS_SILENT},
"strictatime": {false, unix.MS_STRICTATIME},
"suid": {true, unix.MS_NOSUID},
"sync": {false, unix.MS_SYNCHRONOUS},
"symfollow": {true, unix.MS_NOSYMFOLLOW}, // since kernel 5.10
}
recAttrFlags = map[string]struct {
clear bool
flag uint64
}{
"rro": {false, unix.MOUNT_ATTR_RDONLY},
"rrw": {true, unix.MOUNT_ATTR_RDONLY},
"rnosuid": {false, unix.MOUNT_ATTR_NOSUID},
"rsuid": {true, unix.MOUNT_ATTR_NOSUID},
"rnodev": {false, unix.MOUNT_ATTR_NODEV},
"rdev": {true, unix.MOUNT_ATTR_NODEV},
"rnoexec": {false, unix.MOUNT_ATTR_NOEXEC},
"rexec": {true, unix.MOUNT_ATTR_NOEXEC},
"rnodiratime": {false, unix.MOUNT_ATTR_NODIRATIME},
"rdiratime": {true, unix.MOUNT_ATTR_NODIRATIME},
"rrelatime": {false, unix.MOUNT_ATTR_RELATIME},
"rnorelatime": {true, unix.MOUNT_ATTR_RELATIME},
"rnoatime": {false, unix.MOUNT_ATTR_NOATIME},
"ratime": {true, unix.MOUNT_ATTR_NOATIME},
"rstrictatime": {false, unix.MOUNT_ATTR_STRICTATIME},
"rnostrictatime": {true, unix.MOUNT_ATTR_STRICTATIME},
"rnosymfollow": {false, unix.MOUNT_ATTR_NOSYMFOLLOW}, // since kernel 5.14
"rsymfollow": {true, unix.MOUNT_ATTR_NOSYMFOLLOW}, // since kernel 5.14
}
extensionFlags = map[string]struct {
clear bool
flag int
}{
"tmpcopyup": {false, configs.EXT_COPYUP},
}
complexFlags = map[string]func(*configs.Mount){
"idmap": func(m *configs.Mount) {
m.IDMapping = new(configs.MountIDMapping)
m.IDMapping.Recursive = false // noop
},
"ridmap": func(m *configs.Mount) {
m.IDMapping = new(configs.MountIDMapping)
m.IDMapping.Recursive = true
},
}
})
}
// KnownNamespaces returns the list of the known namespaces.
// Used by `runc features`.
func KnownNamespaces() []string {
initMaps()
var res []string
for k := range namespaceMapping {
res = append(res, string(k))
}
sort.Strings(res)
return res
}
// KnownMountOptions returns the list of the known mount options.
// Used by `runc features`.
func KnownMountOptions() []string {
initMaps()
var res []string
for k := range mountFlags {
res = append(res, k)
}
for k := range mountPropagationMapping {
res = append(res, k)
}
for k := range recAttrFlags {
res = append(res, k)
}
for k := range extensionFlags {
res = append(res, k)
}
sort.Strings(res)
return res
}
// AllowedDevices is the set of devices which are automatically included for
// all containers.
//
// # XXX (cyphar)
//
// This behaviour is at the very least "questionable" (if not outright
// wrong) according to the runtime-spec.
//
// Yes, we have to include certain devices other than the ones the user
// specifies, but several devices listed here are not part of the spec
// (including "mknod for any device"?!). In addition, these rules are
// appended to the user-provided set which means that users *cannot disable
// this behaviour*.
//
// ... unfortunately I'm too scared to change this now because who knows how
// many people depend on this (incorrect and arguably insecure) behaviour.
var AllowedDevices = []*devices.Device{
// allow mknod for any device
{
Rule: devices.Rule{
Type: devices.CharDevice,
Major: devices.Wildcard,
Minor: devices.Wildcard,
Permissions: "m",
Allow: true,
},
},
{
Rule: devices.Rule{
Type: devices.BlockDevice,
Major: devices.Wildcard,
Minor: devices.Wildcard,
Permissions: "m",
Allow: true,
},
},
{
Path: "/dev/null",
FileMode: 0o666,
Uid: 0,
Gid: 0,
Rule: devices.Rule{
Type: devices.CharDevice,
Major: 1,
Minor: 3,
Permissions: "rwm",
Allow: true,
},
},
{
Path: "/dev/random",
FileMode: 0o666,
Uid: 0,
Gid: 0,
Rule: devices.Rule{
Type: devices.CharDevice,
Major: 1,
Minor: 8,
Permissions: "rwm",
Allow: true,
},
},
{
Path: "/dev/full",
FileMode: 0o666,
Uid: 0,
Gid: 0,
Rule: devices.Rule{
Type: devices.CharDevice,
Major: 1,
Minor: 7,
Permissions: "rwm",
Allow: true,
},
},
{
Path: "/dev/tty",
FileMode: 0o666,
Uid: 0,
Gid: 0,
Rule: devices.Rule{
Type: devices.CharDevice,
Major: 5,
Minor: 0,
Permissions: "rwm",
Allow: true,
},
},
{
Path: "/dev/zero",
FileMode: 0o666,
Uid: 0,
Gid: 0,
Rule: devices.Rule{
Type: devices.CharDevice,
Major: 1,
Minor: 5,
Permissions: "rwm",
Allow: true,
},
},
{
Path: "/dev/urandom",
FileMode: 0o666,
Uid: 0,
Gid: 0,
Rule: devices.Rule{
Type: devices.CharDevice,
Major: 1,
Minor: 9,
Permissions: "rwm",
Allow: true,
},
},
// /dev/pts/ - pts namespaces are "coming soon"
{
Rule: devices.Rule{
Type: devices.CharDevice,
Major: 136,
Minor: devices.Wildcard,
Permissions: "rwm",
Allow: true,
},
},
{
Rule: devices.Rule{
Type: devices.CharDevice,
Major: 5,
Minor: 2,
Permissions: "rwm",
Allow: true,
},
},
// The following entry for /dev/net/tun device was there from the
// very early days of Docker, but got removed in runc 1.2.0-rc1,
// causing a number of regressions for users (see
// https://github.com/opencontainers/runc/pull/3468).
//
// Some upper-level orchestration tools make it either impossible
// or cumbersome to supply additional device rules, so we have to
// keep this for the sake of backward compatibility.
{
Rule: devices.Rule{
Type: devices.CharDevice,
Major: 10,
Minor: 200,
Permissions: "rwm",
Allow: true,
},
},
}
type CreateOpts struct {
CgroupName string
UseSystemdCgroup bool
NoPivotRoot bool
NoNewKeyring bool
Spec *specs.Spec
RootlessEUID bool
RootlessCgroups bool
}
// getwd is a wrapper similar to os.Getwd, except it always gets
// the value from the kernel, which guarantees the returned value
// to be absolute and clean.
func getwd() (wd string, err error) {
for {
wd, err = unix.Getwd()
if err != unix.EINTR {
break
}
}
return wd, os.NewSyscallError("getwd", err)
}
// CreateLibcontainerConfig creates a new libcontainer configuration from a
// given specification and a cgroup name
func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
// runc's cwd will always be the bundle path
cwd, err := getwd()
if err != nil {
return nil, err
}
spec := opts.Spec
if spec.Root == nil {
return nil, errors.New("root must be specified")
}
rootfsPath := spec.Root.Path
if !filepath.IsAbs(rootfsPath) {
rootfsPath = filepath.Join(cwd, rootfsPath)
}
labels := []string{}
for k, v := range spec.Annotations {
labels = append(labels, k+"="+v)
}
config := &configs.Config{
Rootfs: rootfsPath,
NoPivotRoot: opts.NoPivotRoot,
Readonlyfs: spec.Root.Readonly,
Hostname: spec.Hostname,
Domainname: spec.Domainname,
Labels: append(labels, "bundle="+cwd),
NoNewKeyring: opts.NoNewKeyring,
RootlessEUID: opts.RootlessEUID,
RootlessCgroups: opts.RootlessCgroups,
}
for _, m := range spec.Mounts {
cm, err := createLibcontainerMount(cwd, m)
if err != nil {
return nil, fmt.Errorf("invalid mount %+v: %w", m, err)
}
config.Mounts = append(config.Mounts, cm)
}
defaultDevs, err := createDevices(spec, config)
if err != nil {
return nil, err
}
c, err := CreateCgroupConfig(opts, defaultDevs)
if err != nil {
return nil, err
}
config.Cgroups = c
// set linux-specific config
if spec.Linux != nil {
initMaps()
if spec.Linux.RootfsPropagation != "" {
var exists bool
if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists {
return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation)
}
if config.NoPivotRoot && (config.RootPropagation&unix.MS_PRIVATE != 0) {
return nil, errors.New("rootfsPropagation of [r]private is not safe without pivot_root")
}
}
for _, ns := range spec.Linux.Namespaces {
t, exists := namespaceMapping[ns.Type]
if !exists {
return nil, fmt.Errorf("namespace %q does not exist", ns)
}
if config.Namespaces.Contains(t) {
return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns)
}
config.Namespaces.Add(t, ns.Path)
}
if config.Namespaces.IsPrivate(configs.NEWNET) {
config.Networks = []*configs.Network{
{
Type: "loopback",
},
}
}
if config.Namespaces.Contains(configs.NEWUSER) {
if err := setupUserNamespace(spec, config); err != nil {
return nil, err
}
// For idmap and ridmap mounts without explicit mappings, use the
// ones from the container's userns. If we are joining another
// userns, stash the path.
for _, m := range config.Mounts {
if m.IDMapping != nil && m.IDMapping.UIDMappings == nil && m.IDMapping.GIDMappings == nil {
if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" {
m.IDMapping.UserNSPath = path
} else {
m.IDMapping.UIDMappings = config.UIDMappings
m.IDMapping.GIDMappings = config.GIDMappings
}
}
}
}
config.MaskPaths = spec.Linux.MaskedPaths
config.ReadonlyPaths = spec.Linux.ReadonlyPaths
config.MountLabel = spec.Linux.MountLabel
config.Sysctl = spec.Linux.Sysctl
config.TimeOffsets = spec.Linux.TimeOffsets
if spec.Linux.Seccomp != nil {
seccomp, err := SetupSeccomp(spec.Linux.Seccomp)
if err != nil {
return nil, err
}
config.Seccomp = seccomp
}
if spec.Linux.IntelRdt != nil {
config.IntelRdt = &configs.IntelRdt{
ClosID: spec.Linux.IntelRdt.ClosID,
L3CacheSchema: spec.Linux.IntelRdt.L3CacheSchema,
MemBwSchema: spec.Linux.IntelRdt.MemBwSchema,
}
}
if spec.Linux.Personality != nil {
if len(spec.Linux.Personality.Flags) > 0 {
logrus.Warnf("ignoring unsupported personality flags: %+v because personality flag has not supported at this time", spec.Linux.Personality.Flags)
}
domain, err := getLinuxPersonalityFromStr(string(spec.Linux.Personality.Domain))
if err != nil {
return nil, err
}
config.Personality = &configs.LinuxPersonality{
Domain: domain,
}
}
}
// Set the host UID that should own the container's cgroup.
// This must be performed after setupUserNamespace, so that
// config.HostRootUID() returns the correct result.
//
// Only set it if the container will have its own cgroup
// namespace and the cgroupfs will be mounted read/write.
//
hasCgroupNS := config.Namespaces.IsPrivate(configs.NEWCGROUP)
hasRwCgroupfs := false
if hasCgroupNS {
for _, m := range config.Mounts {
if m.Source == "cgroup" && filepath.Clean(m.Destination) == "/sys/fs/cgroup" && (m.Flags&unix.MS_RDONLY) == 0 {
hasRwCgroupfs = true
break
}
}
}
processUid := 0
if spec.Process != nil {
// Chown the cgroup to the UID running the process,
// which is not necessarily UID 0 in the container
// namespace (e.g., an unprivileged UID in the host
// user namespace).
processUid = int(spec.Process.User.UID)
}
if hasCgroupNS && hasRwCgroupfs {
ownerUid, err := config.HostUID(processUid)
// There are two error cases; we can ignore both.
//
// 1. uidMappings is unset. Either there is no user
// namespace (fine), or it is an error (which is
// checked elsewhere).
//
// 2. The user is unmapped in the user namespace. This is an
// unusual configuration and might be an error. But it too
// will be checked elsewhere, so we can ignore it here.
//
if err == nil {
config.Cgroups.OwnerUID = &ownerUid
}
}
if spec.Process != nil {
config.OomScoreAdj = spec.Process.OOMScoreAdj
config.NoNewPrivileges = spec.Process.NoNewPrivileges
config.Umask = spec.Process.User.Umask
config.ProcessLabel = spec.Process.SelinuxLabel
if spec.Process.Capabilities != nil {
config.Capabilities = &configs.Capabilities{
Bounding: spec.Process.Capabilities.Bounding,
Effective: spec.Process.Capabilities.Effective,
Permitted: spec.Process.Capabilities.Permitted,
Inheritable: spec.Process.Capabilities.Inheritable,
Ambient: spec.Process.Capabilities.Ambient,
}
}
if spec.Process.Scheduler != nil {
s := *spec.Process.Scheduler
config.Scheduler = &s
}
if spec.Process.IOPriority != nil {
ioPriority := *spec.Process.IOPriority
config.IOPriority = &ioPriority
}
}
createHooks(spec, config)
config.Version = specs.Version
return config, nil
}
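// A minimal usage sketch (the option values are illustrative; spec is assumed
// to be a *specs.Spec loaded from the bundle's config.json):
//
//	config, err := CreateLibcontainerConfig(&CreateOpts{
//		CgroupName:       "mycontainer",
//		UseSystemdCgroup: false,
//		Spec:             spec,
//	})
//	if err != nil {
//		return err
//	}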
func toConfigIDMap(specMaps []specs.LinuxIDMapping) []configs.IDMap {
if specMaps == nil {
return nil
}
idmaps := make([]configs.IDMap, len(specMaps))
for i, id := range specMaps {
idmaps[i] = configs.IDMap{
ContainerID: int64(id.ContainerID),
HostID: int64(id.HostID),
Size: int64(id.Size),
}
}
return idmaps
}
func createLibcontainerMount(cwd string, m specs.Mount) (*configs.Mount, error) {
if !filepath.IsAbs(m.Destination) {
// Relax validation for backward compatibility
// TODO (runc v1.x.x): change warning to an error
// return nil, fmt.Errorf("mount destination %s is not absolute", m.Destination)
logrus.Warnf("mount destination %s is not absolute. Support for non-absolute mount destinations will be removed in a future release.", m.Destination)
}
mnt := parseMountOptions(m.Options)
mnt.Destination = m.Destination
mnt.Source = m.Source
mnt.Device = m.Type
if mnt.Flags&unix.MS_BIND != 0 {
// Any "type" the user specified is meaningless (and ignored) for
// bind-mounts -- so we set it to "bind" because rootfs_linux.go
// (incorrectly) relies on this for some checks.
mnt.Device = "bind"
if !filepath.IsAbs(mnt.Source) {
mnt.Source = filepath.Join(cwd, m.Source)
}
}
if m.UIDMappings != nil || m.GIDMappings != nil {
if mnt.IDMapping == nil {
// Neither "idmap" nor "ridmap" were specified.
mnt.IDMapping = new(configs.MountIDMapping)
}
mnt.IDMapping.UIDMappings = toConfigIDMap(m.UIDMappings)
mnt.IDMapping.GIDMappings = toConfigIDMap(m.GIDMappings)
}
// None of the mount arguments can contain a null byte. Normally such
// strings would either cause some other failure or would just be truncated
// when we hit the null byte, but because we serialise these strings as
// netlink messages (which don't have special null-byte handling) we need
// to block this as early as possible.
if strings.IndexByte(mnt.Source, 0) >= 0 ||
strings.IndexByte(mnt.Destination, 0) >= 0 ||
strings.IndexByte(mnt.Device, 0) >= 0 {
return nil, errors.New("mount field contains null byte")
}
return mnt, nil
}
// checkPropertyName checks if a systemd property name is valid. A valid name
// consists of Latin letters only and has at least 3 of them.
func checkPropertyName(s string) error {
if len(s) < 3 {
return errors.New("too short")
}
// Check ASCII characters rather than Unicode runes,
// so we have to use indexes rather than range.
for i := 0; i < len(s); i++ {
ch := s[i]
if (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') {
continue
}
return errors.New("contains non-alphabetic character")
}
return nil
}
// getLinuxPersonalityFromStr converts the string domain received from spec to equivalent integer.
func getLinuxPersonalityFromStr(domain string) (int, error) {
if domain == string(specs.PerLinux32) {
return configs.PerLinux32, nil
} else if domain == string(specs.PerLinux) {
return configs.PerLinux, nil
}
return -1, fmt.Errorf("invalid personality domain %s", domain)
}
// Some systemd properties are documented as having "Sec" suffix
// (e.g. TimeoutStopSec) but are expected to have "USec" suffix
// here, so let's provide conversion to improve compatibility.
func convertSecToUSec(value dbus.Variant) (dbus.Variant, error) {
var sec uint64
const M = 1000000
vi := value.Value()
switch value.Signature().String() {
case "y":
sec = uint64(vi.(byte)) * M
case "n":
sec = uint64(vi.(int16)) * M
case "q":
sec = uint64(vi.(uint16)) * M
case "i":
sec = uint64(vi.(int32)) * M
case "u":
sec = uint64(vi.(uint32)) * M
case "x":
sec = uint64(vi.(int64)) * M
case "t":
sec = vi.(uint64) * M
case "d":
sec = uint64(vi.(float64) * M)
default:
return value, errors.New("not a number")
}
return dbus.MakeVariant(sec), nil
}
func initSystemdProps(spec *specs.Spec) ([]systemdDbus.Property, error) {
const keyPrefix = "org.systemd.property."
var sp []systemdDbus.Property
for k, v := range spec.Annotations {
name, ok := strings.CutPrefix(k, keyPrefix)
if !ok { // prefix not there
continue
}
if err := checkPropertyName(name); err != nil {
return nil, fmt.Errorf("annotation %s name incorrect: %w", k, err)
}
value, err := dbus.ParseVariant(v, dbus.Signature{})
if err != nil {
return nil, fmt.Errorf("annotation %s=%s value parse error: %w", k, v, err)
}
// Check for Sec suffix.
if trimName := strings.TrimSuffix(name, "Sec"); len(trimName) < len(name) {
// Check for a lowercase ascii a-z just before Sec.
if ch := trimName[len(trimName)-1]; ch >= 'a' && ch <= 'z' {
// Convert from Sec to USec.
name = trimName + "USec"
value, err = convertSecToUSec(value)
if err != nil {
return nil, fmt.Errorf("annotation %s=%s value parse error: %w", k, v, err)
}
}
}
sp = append(sp, systemdDbus.Property{Name: name, Value: value})
}
return sp, nil
}
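// For example, a hypothetical annotation
//
//	"org.systemd.property.TimeoutStopSec": "10"
//
// becomes the systemd property TimeoutStopUSec with the value 10000000,
// since the D-Bus property is expressed in microseconds.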
func CreateCgroupConfig(opts *CreateOpts, defaultDevs []*devices.Device) (*cgroups.Cgroup, error) {
var (
myCgroupPath string
spec = opts.Spec
useSystemdCgroup = opts.UseSystemdCgroup
name = opts.CgroupName
)
c := &cgroups.Cgroup{
Systemd: useSystemdCgroup,
Rootless: opts.RootlessCgroups,
Resources: &cgroups.Resources{},
}
if useSystemdCgroup {
sp, err := initSystemdProps(spec)
if err != nil {
return nil, err
}
c.SystemdProps = sp
}
if spec.Linux != nil && spec.Linux.CgroupsPath != "" {
if useSystemdCgroup {
myCgroupPath = spec.Linux.CgroupsPath
} else {
myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath)
}
}
if useSystemdCgroup {
if myCgroupPath == "" {
// Default for c.Parent is set by systemd cgroup drivers.
c.ScopePrefix = "runc"
c.Name = name
} else {
// Parse the path from expected "slice:prefix:name"
// for e.g. "system.slice:docker:1234"
parts := strings.Split(myCgroupPath, ":")
if len(parts) != 3 {
return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups, got %q instead", myCgroupPath)
}
c.Parent = parts[0]
c.ScopePrefix = parts[1]
c.Name = parts[2]
}
} else {
if myCgroupPath == "" {
c.Name = name
}
c.Path = myCgroupPath
}
// In rootless containers, any attempt to make cgroup changes is likely to fail.
// libcontainer will validate this but ignores the error.
if spec.Linux != nil {
r := spec.Linux.Resources
if r != nil {
for i, d := range r.Devices {
var (
t = "a"
major = int64(-1)
minor = int64(-1)
)
if d.Type != "" {
t = d.Type
}
if d.Major != nil {
major = *d.Major
}
if d.Minor != nil {
minor = *d.Minor
}
if d.Access == "" {
return nil, fmt.Errorf("device access at %d field cannot be empty", i)
}
dt, err := stringToCgroupDeviceRune(t)
if err != nil {
return nil, err
}
c.Resources.Devices = append(c.Resources.Devices, &devices.Rule{
Type: dt,
Major: major,
Minor: minor,
Permissions: devices.Permissions(d.Access),
Allow: d.Allow,
})
}
if r.Memory != nil {
if r.Memory.Limit != nil {
c.Resources.Memory = *r.Memory.Limit
}
if r.Memory.Reservation != nil {
c.Resources.MemoryReservation = *r.Memory.Reservation
}
if r.Memory.Swap != nil {
c.Resources.MemorySwap = *r.Memory.Swap
}
if r.Memory.Kernel != nil || r.Memory.KernelTCP != nil { //nolint:staticcheck // Ignore SA1019. Need to keep deprecated package for compatibility.
logrus.Warn("Kernel memory settings are ignored and will be removed")
}
if r.Memory.Swappiness != nil {
c.Resources.MemorySwappiness = r.Memory.Swappiness
}
if r.Memory.DisableOOMKiller != nil {
c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller
}
if r.Memory.CheckBeforeUpdate != nil {
c.Resources.MemoryCheckBeforeUpdate = *r.Memory.CheckBeforeUpdate
}
}
if r.CPU != nil {
if r.CPU.Shares != nil {
c.Resources.CpuShares = *r.CPU.Shares
// CpuWeight is used for cgroupv2 and should be converted
c.Resources.CpuWeight = cgroups.ConvertCPUSharesToCgroupV2Value(c.Resources.CpuShares)
}
if r.CPU.Quota != nil {
c.Resources.CpuQuota = *r.CPU.Quota
}
if r.CPU.Burst != nil {
c.Resources.CpuBurst = r.CPU.Burst
}
if r.CPU.Period != nil {
c.Resources.CpuPeriod = *r.CPU.Period
}
if r.CPU.RealtimeRuntime != nil {
c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime
}
if r.CPU.RealtimePeriod != nil {
c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod
}
c.Resources.CpusetCpus = r.CPU.Cpus
c.Resources.CpusetMems = r.CPU.Mems
c.Resources.CPUIdle = r.CPU.Idle
}
if r.Pids != nil {
c.Resources.PidsLimit = r.Pids.Limit
}
if r.BlockIO != nil {
if r.BlockIO.Weight != nil {
c.Resources.BlkioWeight = *r.BlockIO.Weight
}
if r.BlockIO.LeafWeight != nil {
c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight
}
for _, wd := range r.BlockIO.WeightDevice {
var weight, leafWeight uint16
if wd.Weight != nil {
weight = *wd.Weight
}
if wd.LeafWeight != nil {
leafWeight = *wd.LeafWeight
}
weightDevice := cgroups.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight)
c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice)
}
for _, td := range r.BlockIO.ThrottleReadBpsDevice {
rate := td.Rate
throttleDevice := cgroups.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice)
}
for _, td := range r.BlockIO.ThrottleWriteBpsDevice {
rate := td.Rate
throttleDevice := cgroups.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice)
}
for _, td := range r.BlockIO.ThrottleReadIOPSDevice {
rate := td.Rate
throttleDevice := cgroups.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice)
}
for _, td := range r.BlockIO.ThrottleWriteIOPSDevice {
rate := td.Rate
throttleDevice := cgroups.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice)
}
}
for _, l := range r.HugepageLimits {
c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &cgroups.HugepageLimit{
Pagesize: l.Pagesize,
Limit: l.Limit,
})
}
if len(r.Rdma) > 0 {
c.Resources.Rdma = make(map[string]cgroups.LinuxRdma, len(r.Rdma))
for k, v := range r.Rdma {
c.Resources.Rdma[k] = cgroups.LinuxRdma{
HcaHandles: v.HcaHandles,
HcaObjects: v.HcaObjects,
}
}
}
if r.Network != nil {
if r.Network.ClassID != nil {
c.Resources.NetClsClassid = *r.Network.ClassID
}
for _, m := range r.Network.Priorities {
c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &cgroups.IfPrioMap{
Interface: m.Name,
Priority: int64(m.Priority),
})
}
}
if len(r.Unified) > 0 {
// copy the map
c.Resources.Unified = make(map[string]string, len(r.Unified))
for k, v := range r.Unified {
c.Resources.Unified[k] = v
}
}
}
}
// Append the default allowed devices to the end of the list.
for _, device := range defaultDevs {
c.Resources.Devices = append(c.Resources.Devices, &device.Rule)
}
return c, nil
}
func stringToCgroupDeviceRune(s string) (devices.Type, error) {
switch s {
case "a":
return devices.WildcardDevice, nil
case "b":
return devices.BlockDevice, nil
case "c":
return devices.CharDevice, nil
default:
return 0, fmt.Errorf("invalid cgroup device type %q", s)
}
}
func stringToDeviceRune(s string) (devices.Type, error) {
switch s {
case "p":
return devices.FifoDevice, nil
case "u", "c":
return devices.CharDevice, nil
case "b":
return devices.BlockDevice, nil
default:
return 0, fmt.Errorf("invalid device type %q", s)
}
}
func createDevices(spec *specs.Spec, config *configs.Config) ([]*devices.Device, error) {
// If a spec device is redundant with a default device, remove that default
// device (the spec one takes priority).
dedupedAllowDevs := []*devices.Device{}
next:
for _, ad := range AllowedDevices {
if ad.Path != "" && spec.Linux != nil {
for _, sd := range spec.Linux.Devices {
if sd.Path == ad.Path {
continue next
}
}
}
dedupedAllowDevs = append(dedupedAllowDevs, ad)
if ad.Path != "" {
config.Devices = append(config.Devices, ad)
}
}
// Merge in additional devices from the spec.
if spec.Linux != nil {
for _, d := range spec.Linux.Devices {
var uid, gid uint32
var filemode os.FileMode = 0o666
if d.UID != nil {
uid = *d.UID
}
if d.GID != nil {
gid = *d.GID
}
dt, err := stringToDeviceRune(d.Type)
if err != nil {
return nil, err
}
if d.FileMode != nil {
filemode = *d.FileMode &^ unix.S_IFMT
}
device := &devices.Device{
Rule: devices.Rule{
Type: dt,
Major: d.Major,
Minor: d.Minor,
},
Path: d.Path,
FileMode: filemode,
Uid: uid,
Gid: gid,
}
config.Devices = append(config.Devices, device)
}
}
return dedupedAllowDevs, nil
}
func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
if spec.Linux != nil {
config.UIDMappings = toConfigIDMap(spec.Linux.UIDMappings)
config.GIDMappings = toConfigIDMap(spec.Linux.GIDMappings)
}
if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" {
// Cache the current userns mappings in our configuration, so that we
// can calculate uid and gid mappings within runc. These mappings are
// never used for configuring the container if the path is set.
uidMap, gidMap, err := userns.GetUserNamespaceMappings(path)
if err != nil {
return fmt.Errorf("failed to cache mappings for userns: %w", err)
}
// We cannot allow uid or gid mappings to be set if we are also asked
// to join a userns.
if config.UIDMappings != nil || config.GIDMappings != nil {
// FIXME: It turns out that containerd and CRIO pass both a userns
// path and the mappings of the namespace in the same config.json.
// Such a configuration is technically not valid, but we used to
// require mappings be specified, and thus users worked around our
// bug -- so we can't regress it at the moment. But we also don't
// want to produce broken behaviour if the mapping doesn't match
// the userns. So (for now) we output a warning if the actual
// userns mappings match the configuration, otherwise we return an
// error.
if !userns.IsSameMapping(uidMap, config.UIDMappings) ||
!userns.IsSameMapping(gidMap, config.GIDMappings) {
return errors.New("user namespaces enabled, but both namespace path and non-matching mapping specified -- you may only provide one")
}
logrus.Warnf("config.json has both a userns path to join and a matching userns mapping specified -- you may only provide one. Future versions of runc may return an error with this configuration, please report a bug on <https://github.com/opencontainers/runc> if you see this warning and cannot update your configuration.")
}
config.UIDMappings = uidMap
config.GIDMappings = gidMap
logrus.WithFields(logrus.Fields{
"uid_map": uidMap,
"gid_map": gidMap,
}).Debugf("config uses path-based userns configuration -- current uid and gid mappings cached")
}
rootUID, err := config.HostRootUID()
if err != nil {
return err
}
rootGID, err := config.HostRootGID()
if err != nil {
return err
}
for _, node := range config.Devices {
node.Uid = uint32(rootUID)
node.Gid = uint32(rootGID)
}
return nil
}
// parseMountOptions parses options and returns a configs.Mount
// structure with the fields that depend on those options set accordingly.
func parseMountOptions(options []string) *configs.Mount {
var (
data []string
m configs.Mount
recAttrSet, recAttrClr uint64
)
initMaps()
for _, o := range options {
// If the option does not exist in the mountFlags table,
// or the flag is not supported on the platform,
// then it is a data value for a specific fs type.
if f, exists := mountFlags[o]; exists && f.flag != 0 {
// FIXME: The *atime flags are special (they are more of an enum
// with quite hairy semantics) and thus arguably setting some of
// them should clear unrelated flags.
if f.clear {
m.Flags &= ^f.flag
m.ClearedFlags |= f.flag
} else {
m.Flags |= f.flag
m.ClearedFlags &= ^f.flag
}
} else if f, exists := mountPropagationMapping[o]; exists && f != 0 {
m.PropagationFlags = append(m.PropagationFlags, f)
} else if f, exists := recAttrFlags[o]; exists {
if f.clear {
recAttrClr |= f.flag
recAttrSet &= ^f.flag
} else {
recAttrSet |= f.flag
recAttrClr &= ^f.flag
if f.flag&unix.MOUNT_ATTR__ATIME == f.flag {
// https://man7.org/linux/man-pages/man2/mount_setattr.2.html
// "cannot simply specify the access-time setting in attr_set, but must also include MOUNT_ATTR__ATIME in the attr_clr field."
recAttrClr |= unix.MOUNT_ATTR__ATIME
}
}
} else if f, exists := extensionFlags[o]; exists {
if f.clear {
m.Extensions &= ^f.flag
} else {
m.Extensions |= f.flag
}
} else if fn, exists := complexFlags[o]; exists {
fn(&m)
} else {
data = append(data, o)
}
}
m.Data = strings.Join(data, ",")
if recAttrSet != 0 || recAttrClr != 0 {
m.RecAttr = &unix.MountAttr{
Attr_set: recAttrSet,
Attr_clr: recAttrClr,
}
}
return &m
}
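// A rough sketch of the mapping this performs (the option list is illustrative):
//
//	m := parseMountOptions([]string{"rbind", "nosuid", "ro", "mode=755"})
//	// m.Flags   == unix.MS_BIND|unix.MS_REC|unix.MS_NOSUID|unix.MS_RDONLY
//	// m.Data    == "mode=755"
//	// m.RecAttr == nil (no r* mount_setattr options were given)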
func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) {
if config == nil {
return nil, nil
}
// No default action specified, no syscalls listed, assume seccomp disabled
if config.DefaultAction == "" && len(config.Syscalls) == 0 {
return nil, nil
}
newConfig := new(configs.Seccomp)
newConfig.Syscalls = []*configs.Syscall{}
// The list of flags defined in runtime-spec is a subset of the flags
// in the seccomp() syscall.
if config.Flags == nil {
// No flags are set explicitly (not even the empty set);
// set the default of specs.LinuxSeccompFlagSpecAllow,
// if it is supported by libseccomp and the kernel.
if err := seccomp.FlagSupported(specs.LinuxSeccompFlagSpecAllow); err == nil {
newConfig.Flags = []specs.LinuxSeccompFlag{specs.LinuxSeccompFlagSpecAllow}
}
} else {
// Fail early if some flags are unknown or unsupported.
for _, flag := range config.Flags {
if err := seccomp.FlagSupported(flag); err != nil {
return nil, err
}
newConfig.Flags = append(newConfig.Flags, flag)
}
}
if len(config.Architectures) > 0 {
newConfig.Architectures = []string{}
for _, arch := range config.Architectures {
newArch, err := seccomp.ConvertStringToArch(string(arch))
if err != nil {
return nil, err
}
newConfig.Architectures = append(newConfig.Architectures, newArch)
}
}
// Convert default action from string representation
newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction))
if err != nil {
return nil, err
}
newConfig.DefaultAction = newDefaultAction
newConfig.DefaultErrnoRet = config.DefaultErrnoRet
newConfig.ListenerPath = config.ListenerPath
newConfig.ListenerMetadata = config.ListenerMetadata
// Loop through all syscall blocks and convert them to libcontainer format
for _, call := range config.Syscalls {
newAction, err := seccomp.ConvertStringToAction(string(call.Action))
if err != nil {
return nil, err
}
for _, name := range call.Names {
newCall := configs.Syscall{
Name: name,
Action: newAction,
ErrnoRet: call.ErrnoRet,
Args: []*configs.Arg{},
}
// Loop through all the arguments of the syscall and convert them
for _, arg := range call.Args {
newOp, err := seccomp.ConvertStringToOperator(string(arg.Op))
if err != nil {
return nil, err
}
newArg := configs.Arg{
Index: arg.Index,
Value: arg.Value,
ValueTwo: arg.ValueTwo,
Op: newOp,
}
newCall.Args = append(newCall.Args, &newArg)
}
newConfig.Syscalls = append(newConfig.Syscalls, &newCall)
}
}
return newConfig, nil
}
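// Illustrative sketch (not part of runc): feeding SetupSeccomp a minimal
// spec-level profile that allows everything except chmod, which returns EPERM
// on x86_64. The function name is hypothetical; the resulting *configs.Seccomp
// is what gets wired into the container config.
func exampleSetupSeccomp() (*configs.Seccomp, error) {
errnoRet := uint(unix.EPERM)
return SetupSeccomp(&specs.LinuxSeccomp{
DefaultAction: specs.ActAllow,
Architectures: []specs.Arch{specs.ArchX86_64},
Syscalls: []specs.LinuxSyscall{{
Names:    []string{"chmod"},
Action:   specs.ActErrno,
ErrnoRet: &errnoRet,
}},
})
}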
func createHooks(rspec *specs.Spec, config *configs.Config) {
config.Hooks = configs.Hooks{}
if rspec.Hooks != nil {
for _, h := range rspec.Hooks.Prestart { //nolint:staticcheck // Ignore SA1019. Need to keep deprecated package for compatibility.
cmd := createCommandHook(h)
config.Hooks[configs.Prestart] = append(config.Hooks[configs.Prestart], configs.NewCommandHook(cmd))
}
for _, h := range rspec.Hooks.CreateRuntime {
cmd := createCommandHook(h)
config.Hooks[configs.CreateRuntime] = append(config.Hooks[configs.CreateRuntime], configs.NewCommandHook(cmd))
}
for _, h := range rspec.Hooks.CreateContainer {
cmd := createCommandHook(h)
config.Hooks[configs.CreateContainer] = append(config.Hooks[configs.CreateContainer], configs.NewCommandHook(cmd))
}
for _, h := range rspec.Hooks.StartContainer {
cmd := createCommandHook(h)
config.Hooks[configs.StartContainer] = append(config.Hooks[configs.StartContainer], configs.NewCommandHook(cmd))
}
for _, h := range rspec.Hooks.Poststart {
cmd := createCommandHook(h)
config.Hooks[configs.Poststart] = append(config.Hooks[configs.Poststart], configs.NewCommandHook(cmd))
}
for _, h := range rspec.Hooks.Poststop {
cmd := createCommandHook(h)
config.Hooks[configs.Poststop] = append(config.Hooks[configs.Poststop], configs.NewCommandHook(cmd))
}
}
}
func createCommandHook(h specs.Hook) *configs.Command {
cmd := &configs.Command{
Path: h.Path,
Args: h.Args,
Env: h.Env,
}
if h.Timeout != nil {
d := time.Duration(*h.Timeout) * time.Second
cmd.Timeout = &d
}
return cmd
}
//go:build gofuzz
// +build gofuzz
// Copyright 2021 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package specconv
import (
"io/ioutil"
"os"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runtime-spec/specs-go"
gofuzzheaders "github.com/AdaLogics/go-fuzz-headers"
)
func newTestRoot(name string) (string, error) {
dir, err := os.MkdirTemp("", name)
if err != nil {
return "", err
}
return dir, nil
}
func Fuzz(data []byte) int {
if len(data) < 30 {
return -1
}
f := gofuzzheaders.NewConsumer(data)
linuxSpec := new(specs.Linux)
err := f.GenerateStruct(linuxSpec)
if err != nil {
return 0
}
// Create spec.Spec
spec := new(specs.Spec)
err = f.GenerateStruct(spec)
if err != nil {
return 0
}
spec.Linux = linuxSpec
// Create CreateOpts
opts := new(CreateOpts)
err = f.GenerateStruct(opts)
if err != nil {
return 0
}
opts.Spec = spec
config := &configs.Resources{}
err = f.GenerateStruct(config)
if err != nil {
return 0
}
c, err := CreateCgroupConfig(opts, nil)
if err != nil {
return 0
}
path, err := newTestRoot("fuzzDir")
if err != nil {
return 0
}
um, err := systemd.NewUnifiedManager(c, path)
if err != nil {
return 0
}
_ = um.Set(config)
_ = um.Apply(int(data[0]))
_ = um.Destroy()
return 1
}
package libcontainer
import (
"errors"
"fmt"
"os"
"os/exec"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
)
type linuxStandardInit struct {
pipe *syncSocket
consoleSocket *os.File
pidfdSocket *os.File
parentPid int
fifoFile *os.File
logPipe *os.File
config *initConfig
}
func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
var newperms uint32
if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
// With user ns we need 'other' search permissions.
newperms = 0x8
} else {
// Without user ns we need 'UID' search permissions.
newperms = 0x80000
}
// Create a unique per session container name that we can join in setns;
// however, other containers can also join it.
return "_ses." + l.config.ContainerID, 0xffffffff, newperms
}
func (l *linuxStandardInit) Init() error {
if !l.config.Config.NoNewKeyring {
if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
return err
}
defer selinux.SetKeyLabel("") //nolint: errcheck
ringname, keepperms, newperms := l.getSessionRingParams()
// Do not inherit the parent's session keyring.
if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
// If keyrings aren't supported then it is likely we are on an
// older kernel (or inside an LXC container). While we could bail,
// the security feature we are using here is best-effort (it only
// really provides marginal protection since VFS credentials are
// the only significant protection of keyrings).
//
// TODO(cyphar): Log this so people know what's going on, once we
// have proper logging in 'runc init'.
if !errors.Is(err, unix.ENOSYS) {
return fmt.Errorf("unable to join session keyring: %w", err)
}
} else {
// Make session keyring searchable. If we've gotten this far we
// bail on any error -- we don't want to have a keyring with bad
// permissions.
if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return fmt.Errorf("unable to mod keyring permissions: %w", err)
}
}
}
if err := setupNetwork(l.config); err != nil {
return err
}
if err := setupRoute(l.config.Config); err != nil {
return err
}
// initialises the labeling system
selinux.GetEnabled()
err := prepareRootfs(l.pipe, l.config)
if err != nil {
return err
}
// Set up the console. This has to be done *before* we finalize the rootfs,
// but *after* we've given the user the chance to set up all of the mounts
// they wanted.
if l.config.CreateConsole {
if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
return err
}
if err := system.Setctty(); err != nil {
return &os.SyscallError{Syscall: "ioctl(setctty)", Err: err}
}
}
if l.pidfdSocket != nil {
if err := setupPidfd(l.pidfdSocket, "standard"); err != nil {
return fmt.Errorf("failed to setup pidfd: %w", err)
}
}
// Finish the rootfs setup.
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := finalizeRootfs(l.config.Config); err != nil {
return err
}
}
if hostname := l.config.Config.Hostname; hostname != "" {
if err := unix.Sethostname([]byte(hostname)); err != nil {
return &os.SyscallError{Syscall: "sethostname", Err: err}
}
}
if domainname := l.config.Config.Domainname; domainname != "" {
if err := unix.Setdomainname([]byte(domainname)); err != nil {
return &os.SyscallError{Syscall: "setdomainname", Err: err}
}
}
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return fmt.Errorf("unable to apply apparmor profile: %w", err)
}
for key, value := range l.config.Config.Sysctl {
if err := writeSystemProperty(key, value); err != nil {
return err
}
}
for _, path := range l.config.Config.ReadonlyPaths {
if err := readonlyPath(path); err != nil {
return fmt.Errorf("can't make %q read-only: %w", path, err)
}
}
for _, path := range l.config.Config.MaskPaths {
if err := maskPath(path, l.config.Config.MountLabel); err != nil {
return fmt.Errorf("can't mask path %s: %w", path, err)
}
}
pdeath, err := system.GetParentDeathSignal()
if err != nil {
return fmt.Errorf("can't get pdeath signal: %w", err)
}
if l.config.NoNewPrivileges {
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return &os.SyscallError{Syscall: "prctl(SET_NO_NEW_PRIVS)", Err: err}
}
}
if err := setupScheduler(l.config); err != nil {
return err
}
if err := setupIOPriority(l.config); err != nil {
return err
}
// Tell our parent that we're ready to exec. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return fmt.Errorf("sync ready: %w", err)
}
if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
return fmt.Errorf("can't set process label: %w", err)
}
defer selinux.SetExecLabel("") //nolint: errcheck
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
if err != nil {
return err
}
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
}
if err := finalizeNamespace(l.config); err != nil {
return err
}
// finalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
if err := pdeath.Restore(); err != nil {
return fmt.Errorf("can't restore pdeath signal: %w", err)
}
// In case we have any StartContainer hooks to run, and they don't
// have environment configured explicitly, make sure they will be run
// with the same environment as container's init.
//
// NOTE the above described behavior is not part of runtime-spec, but
// rather a de facto historical thing we are afraid to change.
if h := l.config.Config.Hooks[configs.StartContainer]; len(h) > 0 {
h.SetDefaultEnv(l.config.Env)
}
// Compare the parent from the initial start of the init process and make
// sure that it did not change. If the parent changed, that means it died
// and we were reparented to something else, so we should just kill ourselves
// and not cause problems for someone else.
if unix.Getppid() != l.parentPid {
return unix.Kill(unix.Getpid(), unix.SIGKILL)
}
// Check for the arg before waiting to make sure it exists and it is
// returned as a create time error.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles). However, this needs to be done
// before closing the pipe since we need it to pass the seccompFd to
// the parent.
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
if err != nil {
return fmt.Errorf("unable to init seccomp: %w", err)
}
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
}
// Set personality if specified.
if l.config.Config.Personality != nil {
if err := setupPersonality(l.config.Config); err != nil {
return err
}
}
// Close the pipe to signal that we have completed our init.
logrus.Debugf("init: closing the pipe to signal completion")
_ = l.pipe.Close()
// Close the log pipe fd so the parent's ForwardLogs can exit.
logrus.Debugf("init: about to wait on exec fifo")
if err := l.logPipe.Close(); err != nil {
return fmt.Errorf("close log pipe: %w", err)
}
fifoPath, closer := utils.ProcThreadSelfFd(l.fifoFile.Fd())
defer closer()
// Wait for the FIFO to be opened on the other side before exec-ing the
// user process. We open it through /proc/self/fd/$fd, because the fd that
// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
// re-open an O_PATH fd through /proc.
fd, err := unix.Open(fifoPath, unix.O_WRONLY|unix.O_CLOEXEC, 0)
if err != nil {
return &os.PathError{Op: "open exec fifo", Path: fifoPath, Err: err}
}
if _, err := unix.Write(fd, []byte("0")); err != nil {
return &os.PathError{Op: "write exec fifo", Path: fifoPath, Err: err}
}
// Close the O_PATH fifofd fd before exec because the kernel resets
// dumpable in the wrong order. This has been fixed in newer kernels, but
// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
// N.B. the core issue itself (passing dirfds to the host filesystem) has
// since been resolved.
// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
_ = l.fifoFile.Close()
if s := l.config.SpecState; s != nil {
s.Pid = unix.Getpid()
s.Status = specs.StateCreated
if err := l.config.Config.Hooks.Run(configs.StartContainer, s); err != nil {
return err
}
}
// Close all file descriptors we are not passing to the container. This is
// necessary because the execve target could use internal runc fds as the
// execve path, potentially giving access to binary files from the host
// (which can then be opened by container processes, leading to container
// escapes). Note that because this operation will close any open file
// descriptors that are referenced by (*os.File) handles from underneath
// the Go runtime, we must not do any file operations after this point
// (otherwise the (*os.File) finaliser could close the wrong file). See
// CVE-2024-21626 for more information as to why this protection is
// necessary.
if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
return err
}
return system.Exec(name, l.config.Args, l.config.Env)
}
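// Illustrative sketch (not part of runc): re-opening an O_PATH descriptor with
// a real access mode through procfs, the same trick Init uses above for the
// exec fifo. The function name is hypothetical.
func exampleReopenOPathFd(f *os.File) (*os.File, error) {
path, closer := utils.ProcThreadSelfFd(f.Fd())
defer closer()
return os.OpenFile(path, os.O_WRONLY, 0)
}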
package libcontainer
import (
"fmt"
"os"
"path/filepath"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
)
func newStateTransitionError(from, to containerState) error {
return &stateTransitionError{
From: from.status().String(),
To: to.status().String(),
}
}
// stateTransitionError is returned when an invalid state transition happens from one
// state to another.
type stateTransitionError struct {
From string
To string
}
func (s *stateTransitionError) Error() string {
return fmt.Sprintf("invalid state transition from %s to %s", s.From, s.To)
}
type containerState interface {
transition(containerState) error
destroy() error
status() Status
}
func destroy(c *Container) error {
// Usually, when a container init is gone, all other processes in its
// cgroup are killed by the kernel. This is not the case for a shared
// PID namespace container, which may have some processes left after
// its init is killed or exited.
//
// As the container without init process running is considered stopped,
// and destroy is supposed to remove all the container resources, we need
// to kill those processes here.
if !c.config.Namespaces.IsPrivate(configs.NEWPID) {
// Likely to fail when c.config.RootlessCgroups is true
_ = signalAllProcesses(c.cgroupManager, unix.SIGKILL)
}
if err := c.cgroupManager.Destroy(); err != nil {
return fmt.Errorf("unable to remove container's cgroup: %w", err)
}
if c.intelRdtManager != nil {
if err := c.intelRdtManager.Destroy(); err != nil {
return fmt.Errorf("unable to remove container's IntelRDT group: %w", err)
}
}
if err := os.RemoveAll(c.stateDir); err != nil {
return fmt.Errorf("unable to remove container state dir: %w", err)
}
c.initProcess = nil
err := runPoststopHooks(c)
c.state = &stoppedState{c: c}
return err
}
func runPoststopHooks(c *Container) error {
hooks := c.config.Hooks
if hooks == nil {
return nil
}
s, err := c.currentOCIState()
if err != nil {
return err
}
s.Status = specs.StateStopped
return hooks.Run(configs.Poststop, s)
}
// stoppedState represents a container in a stopped/destroyed state.
type stoppedState struct {
c *Container
}
func (b *stoppedState) status() Status {
return Stopped
}
func (b *stoppedState) transition(s containerState) error {
switch s.(type) {
case *runningState, *restoredState:
b.c.state = s
return nil
case *stoppedState:
return nil
}
return newStateTransitionError(b, s)
}
func (b *stoppedState) destroy() error {
return destroy(b.c)
}
// runningState represents a container that is currently running.
type runningState struct {
c *Container
}
func (r *runningState) status() Status {
return Running
}
func (r *runningState) transition(s containerState) error {
switch s.(type) {
case *stoppedState:
if r.c.hasInit() {
return ErrRunning
}
r.c.state = s
return nil
case *pausedState:
r.c.state = s
return nil
case *runningState:
return nil
}
return newStateTransitionError(r, s)
}
func (r *runningState) destroy() error {
if r.c.hasInit() {
return ErrRunning
}
return destroy(r.c)
}
type createdState struct {
c *Container
}
func (i *createdState) status() Status {
return Created
}
func (i *createdState) transition(s containerState) error {
switch s.(type) {
case *runningState, *pausedState, *stoppedState:
i.c.state = s
return nil
case *createdState:
return nil
}
return newStateTransitionError(i, s)
}
func (i *createdState) destroy() error {
_ = i.c.initProcess.signal(unix.SIGKILL)
return destroy(i.c)
}
// pausedState represents a container that is currently paused. It cannot be destroyed in a
// paused state and must transition back to running first.
type pausedState struct {
c *Container
}
func (p *pausedState) status() Status {
return Paused
}
func (p *pausedState) transition(s containerState) error {
switch s.(type) {
case *runningState, *stoppedState:
p.c.state = s
return nil
case *pausedState:
return nil
}
return newStateTransitionError(p, s)
}
func (p *pausedState) destroy() error {
if p.c.hasInit() {
return ErrPaused
}
if err := p.c.cgroupManager.Freeze(cgroups.Thawed); err != nil {
return err
}
return destroy(p.c)
}
// restoredState is the same as the running state but also has associated checkpoint
// information that may need to be destroyed when the container is stopped and destroy is called.
type restoredState struct {
imageDir string
c *Container
}
func (r *restoredState) status() Status {
return Running
}
func (r *restoredState) transition(s containerState) error {
switch s.(type) {
case *stoppedState, *runningState:
return nil
}
return newStateTransitionError(r, s)
}
func (r *restoredState) destroy() error {
if _, err := os.Stat(filepath.Join(r.c.stateDir, "checkpoint")); err != nil {
if !os.IsNotExist(err) {
return err
}
}
return destroy(r.c)
}
// loadedState is used whenever a container is restored, loaded, or has additional
// processes set up inside it; it should not be destroyed when it is exiting.
type loadedState struct {
c *Container
s Status
}
func (n *loadedState) status() Status {
return n.s
}
func (n *loadedState) transition(s containerState) error {
n.c.state = s
return nil
}
func (n *loadedState) destroy() error {
if err := n.c.refreshState(); err != nil {
return err
}
return n.c.state.destroy()
}
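// Illustrative sketch (not part of runc): the types above form a small state
// machine. Assuming a *Container obtained elsewhere, a created container may
// move to running, while moving from running back to created is rejected with
// a stateTransitionError. The function name is hypothetical.
func exampleStateTransitions(c *Container) error {
created := &createdState{c: c}
if err := created.transition(&runningState{c: c}); err != nil {
return err
}
running := &runningState{c: c}
// Returns "invalid state transition from running to created".
return running.transition(&createdState{c: c})
}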
package libcontainer
import (
"encoding/json"
"errors"
"fmt"
"io"
"os"
"strconv"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/sirupsen/logrus"
)
type syncType string
// Constants that are used for synchronisation between the parent and child
// during container setup. They come in pairs (with procError being a generic
// response which is followed by an &initError).
//
// [ child ] <-> [ parent ]
//
// procMountPlease --> [open(2) or open_tree(2) and configure mount]
// Arg: configs.Mount
// <-- procMountFd
// file: mountfd
//
// procSeccomp --> [forward fd to listenerPath]
// file: seccomp fd
// --- no return synchronisation
//
// procHooks --> [run hooks]
// <-- procHooksDone
//
// procReady --> [final setup]
// <-- procRun
//
// procSeccomp --> [grab seccomp fd with pidfd_getfd()]
// <-- procSeccompDone
const (
procError syncType = "procError"
procReady syncType = "procReady"
procRun syncType = "procRun"
procHooks syncType = "procHooks"
procHooksDone syncType = "procHooksDone"
procMountPlease syncType = "procMountPlease"
procMountFd syncType = "procMountFd"
procSeccomp syncType = "procSeccomp"
procSeccompDone syncType = "procSeccompDone"
)
type syncFlags int
const (
syncFlagHasFd syncFlags = (1 << iota)
)
type syncT struct {
Type syncType `json:"type"`
Flags syncFlags `json:"flags"`
Arg *json.RawMessage `json:"arg,omitempty"`
File *os.File `json:"-"` // passed oob through SCM_RIGHTS
}
func (s syncT) String() string {
str := "type:" + string(s.Type)
if s.Flags != 0 {
str += " flags:0b" + strconv.FormatInt(int64(s.Flags), 2)
}
if s.Arg != nil {
str += " arg:" + string(*s.Arg)
}
if s.File != nil {
str += " file:" + s.File.Name() + " (fd:" + strconv.Itoa(int(s.File.Fd())) + ")"
}
return str
}
// initError is used to wrap errors for passing them via JSON,
// as encoding/json can't unmarshal into error type.
type initError struct {
Message string `json:"message,omitempty"`
}
func (i initError) Error() string {
return i.Message
}
func doWriteSync(pipe *syncSocket, sync syncT) error {
sync.Flags &= ^syncFlagHasFd
if sync.File != nil {
sync.Flags |= syncFlagHasFd
}
logrus.Debugf("writing sync %s", sync)
data, err := json.Marshal(sync)
if err != nil {
return fmt.Errorf("marshal sync %v: %w", sync.Type, err)
}
if _, err := pipe.WritePacket(data); err != nil {
return fmt.Errorf("writing sync %v: %w", sync.Type, err)
}
if sync.Flags&syncFlagHasFd != 0 {
logrus.Debugf("writing sync file %s", sync)
if err := utils.SendFile(pipe.File(), sync.File); err != nil {
return fmt.Errorf("sending file after sync %q: %w", sync.Type, err)
}
}
return nil
}
func writeSync(pipe *syncSocket, sync syncType) error {
return doWriteSync(pipe, syncT{Type: sync})
}
func writeSyncArg(pipe *syncSocket, sync syncType, arg interface{}) error {
argJSON, err := json.Marshal(arg)
if err != nil {
return fmt.Errorf("writing sync %v: marshal argument failed: %w", sync, err)
}
argJSONMsg := json.RawMessage(argJSON)
return doWriteSync(pipe, syncT{Type: sync, Arg: &argJSONMsg})
}
func doReadSync(pipe *syncSocket) (syncT, error) {
var sync syncT
logrus.Debugf("reading sync")
packet, err := pipe.ReadPacket()
if err != nil {
if errors.Is(err, io.EOF) {
logrus.Debugf("sync pipe closed")
return sync, err
}
return sync, fmt.Errorf("reading from parent failed: %w", err)
}
if err := json.Unmarshal(packet, &sync); err != nil {
return sync, fmt.Errorf("unmarshal sync from parent failed: %w", err)
}
logrus.Debugf("read sync %s", sync)
if sync.Type == procError {
var ierr initError
if sync.Arg == nil {
return sync, errors.New("procError missing error payload")
}
if err := json.Unmarshal(*sync.Arg, &ierr); err != nil {
return sync, fmt.Errorf("unmarshal procError failed: %w", err)
}
return sync, &ierr
}
if sync.Flags&syncFlagHasFd != 0 {
logrus.Debugf("reading sync file %s", sync)
file, err := utils.RecvFile(pipe.File())
if err != nil {
return sync, fmt.Errorf("receiving fd from sync %v failed: %w", sync.Type, err)
}
sync.File = file
}
return sync, nil
}
func readSyncFull(pipe *syncSocket, expected syncType) (syncT, error) {
sync, err := doReadSync(pipe)
if err != nil {
return sync, err
}
if sync.Type != expected {
return sync, fmt.Errorf("unexpected synchronisation flag: got %q, expected %q", sync.Type, expected)
}
return sync, nil
}
func readSync(pipe *syncSocket, expected syncType) error {
sync, err := readSyncFull(pipe, expected)
if err != nil {
return err
}
if sync.Arg != nil {
return fmt.Errorf("sync %v had unexpected argument passed: %q", expected, string(*sync.Arg))
}
if sync.File != nil {
_ = sync.File.Close()
return fmt.Errorf("sync %v had unexpected file passed", sync.Type)
}
return nil
}
// parseSync runs the given callback function on each syncT received from the
// child. It will return once io.EOF is returned from the given pipe.
func parseSync(pipe *syncSocket, fn func(*syncT) error) error {
for {
sync, err := doReadSync(pipe)
if err != nil {
if errors.Is(err, io.EOF) {
break
}
return err
}
if err := fn(&sync); err != nil {
return err
}
}
return nil
}
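// Illustrative sketch (not part of runc): a minimal round trip over the
// protocol above, using the socket pair helper from the next file. The
// function name is hypothetical.
func exampleSyncRoundTrip() error {
parent, child, err := newSyncSockpair("example-sync")
if err != nil {
return err
}
defer parent.Close()
defer child.Close()
// The "child" announces it is ready...
if err := writeSync(child, procReady); err != nil {
return err
}
// ...and the "parent" insists on seeing exactly procReady before
// releasing the child with procRun.
if err := readSync(parent, procReady); err != nil {
return err
}
return writeSync(parent, procRun)
}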
package libcontainer
import (
"fmt"
"io"
"os"
"sync/atomic"
"golang.org/x/sys/unix"
)
// syncSocket is a wrapper around a SOCK_SEQPACKET socket, providing
// packet-oriented methods. This is needed because SOCK_SEQPACKET does not
// allow for partial reads, but the Go stdlib treats it as a streamable source,
// which ends up making things like json.Decoder hang forever if the packet is
// bigger than the internal read buffer.
type syncSocket struct {
f *os.File
closed atomic.Bool
}
func newSyncSocket(f *os.File) *syncSocket {
return &syncSocket{f: f}
}
func (s *syncSocket) File() *os.File {
return s.f
}
func (s *syncSocket) Close() error {
// Even with errors from Close(), we have to assume the pipe was closed.
s.closed.Store(true)
return s.f.Close()
}
func (s *syncSocket) isClosed() bool {
return s.closed.Load()
}
func (s *syncSocket) WritePacket(b []byte) (int, error) {
return s.f.Write(b)
}
func (s *syncSocket) ReadPacket() ([]byte, error) {
var (
size int
err error
)
for {
size, _, err = unix.Recvfrom(int(s.f.Fd()), nil, unix.MSG_TRUNC|unix.MSG_PEEK)
if err != unix.EINTR { //nolint:errorlint // unix errors are bare
break
}
}
if err != nil {
return nil, fmt.Errorf("fetch packet length from socket: %w", os.NewSyscallError("recvfrom", err))
}
// We will only get a zero size if the socket has been closed from the
// other end (otherwise recvfrom(2) will block until a packet is ready). In
// addition, SOCK_SEQPACKET is treated as a stream source by Go stdlib so
// returning io.EOF here is correct from that perspective too.
if size == 0 {
return nil, io.EOF
}
buf := make([]byte, size)
n, err := s.f.Read(buf)
if err != nil {
return nil, err
}
if n != size {
return nil, fmt.Errorf("packet read too short: expected %d byte packet but only %d bytes read", size, n)
}
return buf, nil
}
func (s *syncSocket) Shutdown(how int) error {
if err := unix.Shutdown(int(s.f.Fd()), how); err != nil {
return &os.PathError{Op: "shutdown", Path: s.f.Name() + " (sync pipe)", Err: err}
}
return nil
}
// newSyncSockpair returns a new SOCK_SEQPACKET unix socket pair to be used for
// runc-init synchronisation.
func newSyncSockpair(name string) (parent, child *syncSocket, err error) {
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
if err != nil {
return nil, nil, err
}
parentFile := os.NewFile(uintptr(fds[1]), name+"-p")
childFile := os.NewFile(uintptr(fds[0]), name+"-c")
return newSyncSocket(parentFile), newSyncSocket(childFile), nil
}
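// Illustrative sketch (not part of runc): ReadPacket always hands back one
// whole packet, however large, because the size is peeked with
// MSG_TRUNC|MSG_PEEK before the real read. The function name is hypothetical.
func examplePacketFraming() (int, error) {
parent, child, err := newSyncSockpair("example-framing")
if err != nil {
return 0, err
}
defer parent.Close()
defer child.Close()
if _, err := child.WritePacket(make([]byte, 8192)); err != nil {
return 0, err
}
buf, err := parent.ReadPacket()
return len(buf), err // 8192 -- never a partial read
}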
//go:build linux
package system
import (
"fmt"
"io"
"os"
"unsafe"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
type ParentDeathSignal int
func (p ParentDeathSignal) Restore() error {
if p == 0 {
return nil
}
current, err := GetParentDeathSignal()
if err != nil {
return err
}
if p == current {
return nil
}
return p.Set()
}
func (p ParentDeathSignal) Set() error {
return SetParentDeathSignal(uintptr(p))
}
func Exec(cmd string, args []string, env []string) error {
for {
err := unix.Exec(cmd, args, env)
if err != unix.EINTR {
return &os.PathError{Op: "exec", Path: cmd, Err: err}
}
}
}
func SetParentDeathSignal(sig uintptr) error {
if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil {
return err
}
return nil
}
func GetParentDeathSignal() (ParentDeathSignal, error) {
var sig int
if err := unix.Prctl(unix.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0, 0, 0); err != nil {
return -1, err
}
return ParentDeathSignal(sig), nil
}
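// Illustrative sketch (not part of runc): saving and restoring the
// parent-death signal around an operation that clears it (such as changing
// credentials), the same pattern runc init uses. The function name is
// hypothetical.
func exampleKeepPdeathSignal() error {
if err := SetParentDeathSignal(uintptr(unix.SIGKILL)); err != nil {
return err
}
pdeath, err := GetParentDeathSignal()
if err != nil {
return err
}
// ... a setuid/setgid here would clear PR_SET_PDEATHSIG ...
return pdeath.Restore()
}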
func SetKeepCaps() error {
if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 1, 0, 0, 0); err != nil {
return err
}
return nil
}
func ClearKeepCaps() error {
if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 0, 0, 0, 0); err != nil {
return err
}
return nil
}
func Setctty() error {
if err := unix.IoctlSetInt(0, unix.TIOCSCTTY, 0); err != nil {
return err
}
return nil
}
// SetSubreaper sets the value i as the subreaper setting for the calling process
func SetSubreaper(i int) error {
return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)
}
// GetSubreaper returns the subreaper setting for the calling process
func GetSubreaper() (int, error) {
var i uintptr
if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil {
return -1, err
}
return int(i), nil
}
func ExecutableMemfd(comment string, flags int) (*os.File, error) {
// Try to use MFD_EXEC first. On pre-6.3 kernels we get -EINVAL for this
// flag. On post-6.3 kernels, with vm.memfd_noexec=1 this ensures we get an
// executable memfd. For vm.memfd_noexec=2 this is a bit more complicated.
// The original vm.memfd_noexec=2 implementation incorrectly silently
// allowed MFD_EXEC[1] -- this should be fixed in 6.6. On 6.6 and newer
// kernels, we will get -EACCES if we try to use MFD_EXEC with
// vm.memfd_noexec=2 (for 6.3-6.5, -EINVAL was the intended return value).
//
// The upshot is we only need to retry without MFD_EXEC on -EINVAL because
// it just so happens that passing MFD_EXEC bypasses vm.memfd_noexec=2 on
// kernels where -EINVAL is actually a security denial.
memfd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC)
if err == unix.EINVAL {
memfd, err = unix.MemfdCreate(comment, flags)
}
if err != nil {
if err == unix.EACCES {
logrus.Info("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE")
}
err := os.NewSyscallError("memfd_create", err)
return nil, fmt.Errorf("failed to create executable memfd: %w", err)
}
return os.NewFile(uintptr(memfd), "/memfd:"+comment), nil
}
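// Illustrative sketch (not part of runc): writing an ELF image into an
// executable memfd; the descriptor can later be sealed and executed through
// /proc/self/fd/<fd>. The function name and elf parameter are hypothetical.
func exampleExecutableMemfd(elf []byte) (*os.File, error) {
memfd, err := ExecutableMemfd("example", unix.MFD_CLOEXEC|unix.MFD_ALLOW_SEALING)
if err != nil {
return nil, err
}
if _, err := memfd.Write(elf); err != nil {
memfd.Close()
return nil, err
}
return memfd, nil
}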
// Copy is like io.Copy except it uses sendfile(2) if the source and sink are
// both (*os.File) as an optimisation to make copies faster.
func Copy(dst io.Writer, src io.Reader) (copied int64, err error) {
dstFile, _ := dst.(*os.File)
srcFile, _ := src.(*os.File)
if dstFile != nil && srcFile != nil {
fi, err := srcFile.Stat()
if err != nil {
goto fallback
}
size := fi.Size()
for size > 0 {
n, err := unix.Sendfile(int(dstFile.Fd()), int(srcFile.Fd()), nil, int(size))
if n > 0 {
size -= int64(n)
copied += int64(n)
}
if err == unix.EINTR {
continue
}
if err != nil {
if copied == 0 {
// If we haven't copied anything so far, we can safely just
// fallback to io.Copy. We could always do the fallback but
// it's safer to error out in the case of a partial copy
// followed by an error (which should never happen).
goto fallback
}
return copied, fmt.Errorf("partial sendfile copy: %w", err)
}
}
return copied, nil
}
fallback:
return io.Copy(dst, src)
}
// SetLinuxPersonality sets the Linux execution personality. For more information see the personality syscall documentation.
// See getLinuxPersonalityFromStr() in libcontainer/specconv/spec_linux.go for the type conversion.
func SetLinuxPersonality(personality int) error {
_, _, errno := unix.Syscall(unix.SYS_PERSONALITY, uintptr(personality), 0, 0)
if errno != 0 {
return &os.SyscallError{Syscall: "set_personality", Err: errno}
}
return nil
}
package system
import (
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
)
// State is the status of a process.
type State rune
const ( // Only values for Linux 3.14 and later are listed here
Dead State = 'X'
DiskSleep State = 'D'
Running State = 'R'
Sleeping State = 'S'
Stopped State = 'T'
TracingStop State = 't'
Zombie State = 'Z'
Parked State = 'P'
Idle State = 'I'
)
// String forms of the state from proc(5)'s documentation for
// /proc/[pid]/status' "State" field.
func (s State) String() string {
switch s {
case Dead:
return "dead"
case DiskSleep:
return "disk sleep"
case Running:
return "running"
case Sleeping:
return "sleeping"
case Stopped:
return "stopped"
case TracingStop:
return "tracing stop"
case Zombie:
return "zombie"
case Parked:
return "parked"
case Idle:
return "idle" // kernel thread
default:
return fmt.Sprintf("unknown (%c)", s)
}
}
// Stat_t represents the information from /proc/[pid]/stat, as
// described in proc(5) with names based on the /proc/[pid]/status
// fields.
type Stat_t struct {
// Name is the command run by the process.
Name string
// State is the state of the process.
State State
// StartTime is the number of clock ticks after system boot (since
// Linux 2.6).
StartTime uint64
}
// Stat returns a Stat_t instance for the specified process.
func Stat(pid int) (stat Stat_t, err error) {
bytes, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat"))
if err != nil {
return stat, err
}
return parseStat(string(bytes))
}
func parseStat(data string) (stat Stat_t, err error) {
// Example:
// 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0
// The fields are space-separated, see full description in proc(5).
//
// We are only interested in:
// * field 2: process name. It is the only field enclosed in parentheses,
// as it can contain spaces (and parentheses) inside.
// * field 3: process state, a single character (%c)
// * field 22: process start time, a long unsigned integer (%llu).
// 1. Look for the first '(' and the last ')'; what's in between is Name.
// We expect at least 20 fields and a space after the last one.
const minAfterName = 20*2 + 1 // the min field is '0 '.
first := strings.IndexByte(data, '(')
if first < 0 || first+minAfterName >= len(data) {
return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data)
}
last := strings.LastIndexByte(data, ')')
if last <= first || last+minAfterName >= len(data) {
return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data)
}
stat.Name = data[first+1 : last]
// 2. Remove fields 1 and 2 and a space after. State is right after.
data = data[last+2:]
stat.State = State(data[0])
// 3. StartTime is field 22, data is at field 3 now, so we need to skip 19 spaces.
skipSpaces := 22 - 3
for first = 0; skipSpaces > 0 && first < len(data); first++ {
if data[first] == ' ' {
skipSpaces--
}
}
// Now first points to StartTime; look for space right after.
i := strings.IndexByte(data[first:], ' ')
if i < 0 {
return stat, fmt.Errorf("invalid stat data (too short): %q", data)
}
stat.StartTime, err = strconv.ParseUint(data[first:first+i], 10, 64)
if err != nil {
return stat, fmt.Errorf("invalid stat data (bad start time): %w", err)
}
return stat, nil
}
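// Illustrative sketch (not part of runc): parsing a synthetic stat line whose
// 22nd field is 12345 yields Name "example proc", State Sleeping and
// StartTime 12345. The function name is hypothetical.
func exampleParseStat() (Stat_t, error) {
line := "1 (example proc) S 0 1 1 0 -1 4194560 0 0 0 0 0 0 0 0 20 0 1 0 12345 0 0"
return parseStat(line)
}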
//go:build go1.23
package system
import (
"syscall"
)
// ClearRlimitNofileCache clears the Go runtime's nofile rlimit cache. The
// argument is the process's RLIMIT_NOFILE value. Relies on go.dev/cl/588076.
func ClearRlimitNofileCache(lim *syscall.Rlimit) {
// Ignore the return values since we only need to clean the cache,
// the limit is going to be set via unix.Prlimit elsewhere.
_ = syscall.Setrlimit(syscall.RLIMIT_NOFILE, lim)
}
package utils
/*
* Copyright 2016, 2017 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import (
"fmt"
"os"
"runtime"
"golang.org/x/sys/unix"
)
// MaxNameLen is the maximum length of the name of a file descriptor being sent
// using SendFile. The name of the file handle returned by RecvFile will never be
// larger than this value.
const MaxNameLen = 4096
// oobSpace is the size of the oob slice required to store a single FD. Note
// that unix.UnixRights appears to make the assumption that fd is always int32,
// so sizeof(fd) = 4.
var oobSpace = unix.CmsgSpace(4)
// RecvFile waits for a file descriptor to be sent over the given AF_UNIX
// socket. The file name of the remote file descriptor will be recreated
// locally (it is sent as non-auxiliary data in the same payload).
func RecvFile(socket *os.File) (_ *os.File, Err error) {
name := make([]byte, MaxNameLen)
oob := make([]byte, oobSpace)
sockfd := socket.Fd()
var (
n, oobn int
err error
)
for {
n, oobn, _, _, err = unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC)
if err != unix.EINTR { //nolint:errorlint // unix errors are bare
break
}
}
if err != nil {
return nil, os.NewSyscallError("recvmsg", err)
}
if n >= MaxNameLen || oobn != oobSpace {
return nil, fmt.Errorf("recvfile: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
}
// Truncate.
name = name[:n]
oob = oob[:oobn]
scms, err := unix.ParseSocketControlMessage(oob)
if err != nil {
return nil, err
}
// We cannot control how many SCM_RIGHTS we receive, and upon receiving
// them all of the descriptors are installed in our fd table, so we need to
// parse all of the SCM_RIGHTS we received in order to close all of the
// descriptors on error.
var fds []int
defer func() {
for i, fd := range fds {
if i == 0 && Err == nil {
// Only close the first one on error.
continue
}
// Always close extra ones.
_ = unix.Close(fd)
}
}()
var lastErr error
for _, scm := range scms {
if scm.Header.Type == unix.SCM_RIGHTS {
scmFds, err := unix.ParseUnixRights(&scm)
if err != nil {
lastErr = err
} else {
fds = append(fds, scmFds...)
}
}
}
if lastErr != nil {
return nil, lastErr
}
// We do this after collecting the fds to make sure we close them all when
// returning an error here.
if len(scms) != 1 {
return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
}
if len(fds) != 1 {
return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
}
return os.NewFile(uintptr(fds[0]), string(name)), nil
}
// SendFile sends a file over the given AF_UNIX socket. file.Name() is also
// included so that if the other end uses RecvFile, the file will have the same
// name information.
func SendFile(socket *os.File, file *os.File) error {
name := file.Name()
if len(name) >= MaxNameLen {
return fmt.Errorf("sendfd: filename too long: %s", name)
}
err := SendRawFd(socket, name, file.Fd())
runtime.KeepAlive(file)
return err
}
// SendRawFd sends a specific file descriptor over the given AF_UNIX socket.
func SendRawFd(socket *os.File, msg string, fd uintptr) error {
oob := unix.UnixRights(int(fd))
for {
err := unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0)
if err != unix.EINTR { //nolint:errorlint // unix errors are bare
return os.NewSyscallError("sendmsg", err)
}
}
}
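// Illustrative sketch (not part of runc): shuttling an open *os.File between
// the two ends of a socket pair; the received file keeps the sender's file
// name. The function name is hypothetical.
func exampleSendRecvFile(f *os.File) (*os.File, error) {
parent, child, err := NewSockPair("example-fd")
if err != nil {
return nil, err
}
defer parent.Close()
defer child.Close()
if err := SendFile(child, f); err != nil {
return nil, err
}
return RecvFile(parent)
}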
//go:build gofuzz
// +build gofuzz
// Copyright 2022 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package utils
import (
fuzz "github.com/AdaLogics/go-fuzz-headers"
)
func FuzzstripRoot(data []byte) int {
f := fuzz.NewConsumer(data)
root, err := f.GetString()
if err != nil {
return 0
}
path, err := f.GetString()
if err != nil {
return 0
}
_ = stripRoot(root, path)
return 1
}
package utils
import (
"encoding/json"
"io"
"os"
"path/filepath"
"strings"
"golang.org/x/sys/unix"
)
const (
exitSignalOffset = 128
)
// ExitStatus returns the correct exit status for a process based on whether it
// was signaled or exited cleanly.
func ExitStatus(status unix.WaitStatus) int {
if status.Signaled() {
return exitSignalOffset + int(status.Signal())
}
return status.ExitStatus()
}
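// Illustrative sketch (not part of runc): reaping a child and reporting its
// exit status using the shell convention above (128+signal for signal
// deaths). The function name is hypothetical.
func exampleWaitExitStatus(pid int) (int, error) {
var ws unix.WaitStatus
if _, err := unix.Wait4(pid, &ws, 0, nil); err != nil {
return -1, err
}
// A child killed by SIGKILL yields 128+9 == 137 here.
return ExitStatus(ws), nil
}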
// WriteJSON writes the provided struct v to w using standard json marshaling
// without a trailing newline. This is used instead of json.Encoder because
// there might be a problem with the JSON decoder in some cases, see:
// https://github.com/docker/docker/issues/14203#issuecomment-174177790
func WriteJSON(w io.Writer, v interface{}) error {
data, err := json.Marshal(v)
if err != nil {
return err
}
_, err = w.Write(data)
return err
}
// CleanPath makes a path safe for use with filepath.Join. This is done by not
// only cleaning the path, but also (if the path is relative) adding a leading
// '/' and cleaning it (then removing the leading '/'). This ensures that a
// path resulting from prepending another path will always resolve lexically to
// a subdirectory of the prefixed path. This is all done lexically, so paths
// that include symlinks won't be safe as a result of using CleanPath.
func CleanPath(path string) string {
// Deal with empty strings nicely.
if path == "" {
return ""
}
// Ensure that all paths are cleaned (especially problematic ones like
// "/../../../../../" which can cause lots of issues).
if filepath.IsAbs(path) {
return filepath.Clean(path)
}
// If the path isn't absolute, we need to do more processing to fix paths
// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
// paths to relative ones.
path = filepath.Clean(string(os.PathSeparator) + path)
// This can't fail, as (by definition) all paths are relative to root.
path, _ = filepath.Rel(string(os.PathSeparator), path)
return path
}
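// Illustrative sketch (not part of runc): ".." components cannot escape a
// prefix once the cleaned path is joined onto it. The function name is
// hypothetical.
func exampleCleanPath() (string, string) {
rel := CleanPath("../../etc/passwd") // "etc/passwd"
// Joining the cleaned path stays under /root, unlike joining the raw
// "../../etc", which would resolve to "/etc".
joined := filepath.Join("/root", CleanPath("../../etc")) // "/root/etc"
return rel, joined
}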
// stripRoot returns the passed path, stripping the root path if it was
// (lexically) inside it. Note that both passed paths will always be treated
// as absolute, and the returned path will also always be absolute. In
// addition, the paths are cleaned before stripping the root.
func stripRoot(root, path string) string {
// Make the paths clean and absolute.
root, path = CleanPath("/"+root), CleanPath("/"+path)
switch {
case path == root:
path = "/"
case root == "/":
// do nothing
default:
path = strings.TrimPrefix(path, root+"/")
}
return CleanPath("/" + path)
}
// SearchLabels searches through a list of key=value pairs for a given key,
// returning its value and a boolean flag telling whether the key exists.
func SearchLabels(labels []string, key string) (string, bool) {
key += "="
for _, s := range labels {
if val, ok := strings.CutPrefix(s, key); ok {
return val, true
}
}
return "", false
}
// Annotations returns the bundle path and user defined annotations from the
// libcontainer state. We need to remove the bundle because that is a label
// added by libcontainer.
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
userAnnotations = make(map[string]string)
for _, l := range labels {
name, value, ok := strings.Cut(l, "=")
if !ok {
continue
}
if name == "bundle" {
bundle = value
} else {
userAnnotations[name] = value
}
}
return
}
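// Illustrative sketch (not part of runc): splitting libcontainer labels into
// the bundle path and user annotations, and looking up a single key. The
// label values and function name are hypothetical.
func exampleLabels() (string, map[string]string, string) {
labels := []string{"bundle=/run/containers/abc", "io.kubernetes.pod=web"}
bundle, annotations := Annotations(labels) // "/run/containers/abc", {"io.kubernetes.pod": "web"}
pod, _ := SearchLabels(labels, "io.kubernetes.pod")
return bundle, annotations, pod
}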
//go:build !windows
package utils
import (
"fmt"
"math"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
_ "unsafe" // for go:linkname
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// EnsureProcHandle returns an error if the given file handle is not on procfs.
func EnsureProcHandle(fh *os.File) error {
var buf unix.Statfs_t
if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil {
return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err)
}
if buf.Type != unix.PROC_SUPER_MAGIC {
return fmt.Errorf("%s is not on procfs", fh.Name())
}
return nil
}
var (
haveCloseRangeCloexecBool bool
haveCloseRangeCloexecOnce sync.Once
)
func haveCloseRangeCloexec() bool {
haveCloseRangeCloexecOnce.Do(func() {
// Make sure we're not closing a random file descriptor.
tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
if err != nil {
return
}
defer unix.Close(tmpFd)
err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
// Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
// -ENOSYS and -EINVAL ultimately mean we don't have support, but any
// other potential error would imply that even the most basic close
// operation wouldn't work.
haveCloseRangeCloexecBool = err == nil
})
return haveCloseRangeCloexecBool
}
type fdFunc func(fd int)
// fdRangeFrom calls the passed fdFunc for each open file descriptor in the
// current process that is greater than or equal to minFd.
func fdRangeFrom(minFd int, fn fdFunc) error {
procSelfFd, closer := ProcThreadSelf("fd")
defer closer()
fdDir, err := os.Open(procSelfFd)
if err != nil {
return err
}
defer fdDir.Close()
if err := EnsureProcHandle(fdDir); err != nil {
return err
}
fdList, err := fdDir.Readdirnames(-1)
if err != nil {
return err
}
for _, fdStr := range fdList {
fd, err := strconv.Atoi(fdStr)
// Ignore non-numeric file names.
if err != nil {
continue
}
// Ignore descriptors lower than our specified minimum.
if fd < minFd {
continue
}
// Ignore the file descriptor we used for readdir, as it will be closed
// when we return.
if uintptr(fd) == fdDir.Fd() {
continue
}
// Run the closure.
fn(fd)
}
return nil
}
// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
// equal to minFd in the current process.
func CloseExecFrom(minFd int) error {
// Use close_range(CLOSE_RANGE_CLOEXEC) if possible.
if haveCloseRangeCloexec() {
err := unix.CloseRange(uint(minFd), math.MaxInt32, unix.CLOSE_RANGE_CLOEXEC)
if err == nil {
return nil
}
logrus.Debugf("close_range failed, closing range one at a time (error: %v)", err)
}
// Otherwise, fall back to the standard loop.
return fdRangeFrom(minFd, unix.CloseOnExec)
}
// In order to make sure we do not close the internal epoll descriptors the Go
// runtime uses, we need to ensure that we skip descriptors that match
// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing,
// unfortunately there's no other way to be sure we're only keeping the file
// descriptors the Go runtime needs. Hopefully nothing blows up doing this...
//
//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor
func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the
// current process, except for those critical to Go's runtime (such as the
// netpoll management descriptors).
//
// NOTE: This function is incredibly dangerous to use in most Go code, as
// closing file descriptors from underneath *os.File handles can lead to very
// bad behaviour (the closed file descriptor can be re-used and then any
// *os.File operations would apply to the wrong file). This function is only
// intended to be called from the last stage of runc init.
func UnsafeCloseFrom(minFd int) error {
// We cannot use close_range(2) even if it is available, because we must
// not close some file descriptors.
return fdRangeFrom(minFd, func(fd int) {
if runtime_IsPollDescriptor(uintptr(fd)) {
// These are the Go runtime's internal netpoll file descriptors.
// These file descriptors are operated on deep in the Go scheduler,
// and closing those files from underneath Go can result in panics.
// There is no issue with keeping them because they are not
// executable and are not useful to an attacker anyway. Also we
// don't have any choice.
return
}
// There's nothing we can do about errors from close(2), and the
// only likely error to be seen is EBADF which indicates the fd was
// already closed (in which case, we got what we wanted).
_ = unix.Close(fd)
})
}
// NewSockPair returns a new SOCK_STREAM unix socket pair.
func NewSockPair(name string) (parent, child *os.File, err error) {
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
if err != nil {
return nil, nil, err
}
return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
}
// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
// corresponding to the unsafePath resolved within the root. Before passing the
// fd, this path is verified to have been inside the root -- so operating on it
// through the passed fdpath should be safe. Do not access this path through
// the original path strings, and do not attempt to use the pathname outside of
// the passed closure (the file handle will be freed once the closure returns).
func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
// Remove the root then forcefully resolve inside the root.
unsafePath = stripRoot(root, unsafePath)
path, err := securejoin.SecureJoin(root, unsafePath)
if err != nil {
return fmt.Errorf("resolving path inside rootfs failed: %w", err)
}
procSelfFd, closer := ProcThreadSelf("fd/")
defer closer()
// Open the target path.
fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
if err != nil {
return fmt.Errorf("open o_path procfd: %w", err)
}
defer fh.Close()
procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
// Double-check the path is the one we expected.
if realpath, err := os.Readlink(procfd); err != nil {
return fmt.Errorf("procfd verification failed: %w", err)
} else if realpath != path {
return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
}
return fn(procfd)
}
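// Illustrative sketch (not part of runc): operating on a path inside a rootfs
// through the verified procfd, so a symlinked /etc cannot redirect the chmod
// outside the root. The function name and paths are hypothetical.
func exampleWithProcfd(rootfs string) error {
return WithProcfd(rootfs, "/etc/hostname", func(procfd string) error {
return os.Chmod(procfd, 0o644)
})
}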
type ProcThreadSelfCloser func()
var (
haveProcThreadSelf bool
haveProcThreadSelfOnce sync.Once
)
// ProcThreadSelf returns a string that is equivalent to
// /proc/thread-self/<subpath>, with a graceful fallback on older kernels where
// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin,
// meaning that the passed string needs to be trusted. The caller _must_ call
// the returned procThreadSelfCloser function (which is runtime.UnlockOSThread)
// *only once* after it has finished using the returned path string.
func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
haveProcThreadSelfOnce.Do(func() {
if _, err := os.Stat("/proc/thread-self/"); err == nil {
haveProcThreadSelf = true
} else {
logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err)
}
})
// We need to lock our thread until the caller is done with the path string
// because any non-atomic operation on the path (such as opening a file,
// then reading it) could be interrupted by the Go runtime where the
// underlying thread is swapped out and the original thread is killed,
// resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
// addition, the pre-3.17 fallback makes everything non-atomic because the
// same thing could happen between unix.Gettid() and the path operations.
//
// In theory, we don't need to lock in the atomic user case when using
// /proc/thread-self/, but it's better to be safe than sorry (and there are
// only one or two truly atomic users of /proc/thread-self/).
runtime.LockOSThread()
threadSelf := "/proc/thread-self/"
if !haveProcThreadSelf {
// Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
if _, err := os.Stat(threadSelf); err != nil {
// Unfortunately, this code is called from rootfs_linux.go where we
// are running inside the pid namespace of the container but /proc
// is the host's procfs. Unfortunately there is no real way to get
// the correct tid to use here (the kernel age means we cannot do
// things like set up a private fsopen("proc") -- even scanning
// NSpid in all of the tasks in /proc/self/task/*/status requires
// Linux 4.1).
//
// So, we just have to assume that /proc/self is acceptable in this
// one specific case.
if os.Getpid() == 1 {
logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
} else {
// This should never happen, but the fallback should work in most cases...
logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
}
threadSelf = "/proc/self/"
}
}
return threadSelf + subpath, runtime.UnlockOSThread
}
// ProcThreadSelfFd is a small wrapper around ProcThreadSelf to make it easier
// to create a /proc/thread-self handle for a given file descriptor.
//
// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but
// without using fmt.Sprintf to avoid unneeded overhead.
func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10))
}
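// Illustrative sketch (not part of runc): reading a thread-scoped procfs file;
// the closer (runtime.UnlockOSThread) must be called exactly once, only after
// the returned path is no longer needed. The function name is hypothetical.
func exampleProcThreadSelf() ([]byte, error) {
path, closer := ProcThreadSelf("status")
defer closer()
return os.ReadFile(path)
}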
// IsLexicallyInRoot is shorthand for strings.HasPrefix(path+"/", root+"/"),
// but properly handling the case where path or root are "/".
//
// NOTE: The return value only makes sense if the path doesn't contain "..".
func IsLexicallyInRoot(root, path string) bool {
if root != "/" {
root += "/"
}
if path != "/" {
path += "/"
}
return strings.HasPrefix(path, root)
}
// MkdirAllInRootOpen attempts to make
//
// path, _ := securejoin.SecureJoin(root, unsafePath)
// os.MkdirAll(path, mode)
// os.Open(path)
//
// safer against attacks where components in the path are changed between
// SecureJoin returning and MkdirAll (or Open) being called. In particular, we
// try to detect any symlink components in the path while we are doing the
// MkdirAll.
//
// NOTE: If unsafePath is a subpath of root, we assume that you have already
// called SecureJoin and so we use the provided path verbatim without resolving
// any symlinks (this is done in a way that avoids symlink-exchange races).
// This means that the path also must not contain ".." elements, otherwise an
// error will occur.
//
// This uses securejoin.MkdirAllHandle under the hood, but it has special
// handling if unsafePath has already been scoped within the rootfs (this is
// needed for a lot of runc callers and fixing this would require reworking a
// lot of path logic).
func MkdirAllInRootOpen(root, unsafePath string, mode os.FileMode) (_ *os.File, Err error) {
// If the path is already "within" the root, get the path relative to the
// root and use that as the unsafe path. This is necessary because a lot of
// MkdirAllInRootOpen callers have already done SecureJoin, and refactoring
// all of them to stop using these SecureJoin'd paths would require a fair
// amount of work.
// TODO(cyphar): Do the refactor to libpathrs once it's ready.
if IsLexicallyInRoot(root, unsafePath) {
subPath, err := filepath.Rel(root, unsafePath)
if err != nil {
return nil, err
}
unsafePath = subPath
}
// Check for any silly mode bits.
if mode&^0o7777 != 0 {
return nil, fmt.Errorf("tried to include non-mode bits in MkdirAll mode: 0o%.3o", mode)
}
// Linux (and thus os.MkdirAll) silently ignores the suid and sgid bits if
// passed. While it would make sense to return an error in that case (since
// the user has asked for a mode that won't be applied), for compatibility
// reasons we have to ignore these bits.
if ignoredBits := mode &^ 0o1777; ignoredBits != 0 {
logrus.Warnf("MkdirAll called with no-op mode bits that are ignored by Linux: 0o%.3o", ignoredBits)
mode &= 0o1777
}
rootDir, err := os.OpenFile(root, unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
if err != nil {
return nil, fmt.Errorf("open root handle: %w", err)
}
defer rootDir.Close()
return securejoin.MkdirAllHandle(rootDir, unsafePath, mode)
}
// MkdirAllInRoot is a wrapper around MkdirAllInRootOpen which closes the
// returned handle, for callers that don't need to use it.
func MkdirAllInRoot(root, unsafePath string, mode os.FileMode) error {
f, err := MkdirAllInRootOpen(root, unsafePath, mode)
if err == nil {
_ = f.Close()
}
return err
}
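// Illustrative sketch (not part of runc): creating a directory tree inside a
// container rootfs while defending against symlink swaps in the path
// components. The function name and paths are hypothetical.
func exampleMkdirAllInRoot(rootfs string) error {
return MkdirAllInRoot(rootfs, "var/run/secrets", 0o755)
}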
// Openat is a Go-friendly openat(2) wrapper.
func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) {
// dir may be nil, which means AT_FDCWD (the current working directory).
dirFd, dirName := unix.AT_FDCWD, "."
if dir != nil {
dirFd, dirName = int(dir.Fd()), dir.Name()
}
flags |= unix.O_CLOEXEC
fd, err := unix.Openat(dirFd, path, flags, mode)
if err != nil {
return nil, &os.PathError{Op: "openat", Path: path, Err: err}
}
return os.NewFile(uintptr(fd), dirName+"/"+path), nil
}