/src/crosvm/third_party/minijail/system.c

Source
/* Copyright 2017 The ChromiumOS Authors
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "system.h"

#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <net/if.h>
#include <pwd.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <unistd.h>

#include <linux/securebits.h>

#include "syscall_wrapper.h"
#include "util.h"

/*
 * SECBIT_NO_CAP_AMBIENT_RAISE was added in kernel 4.3, so fill in the
 * definition if the securebits header doesn't provide it.
 *
 * Every fallback in this section pairs a flag bit with its _LOCKED
 * companion on the next bit up (6/7, 8/9 and 10/11).
 */
#ifndef SECBIT_NO_CAP_AMBIENT_RAISE
#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(6))
#endif

#ifndef SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED
#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED (issecure_mask(7))
#endif

/*
 * SECBIT_EXEC_RESTRICT_FILE was added in kernel 6.14, so fill in the
 * definition if the securebits header doesn't provide it.
 */
#ifndef SECBIT_EXEC_RESTRICT_FILE
#define SECBIT_EXEC_RESTRICT_FILE (issecure_mask(8))
#endif

#ifndef SECBIT_EXEC_RESTRICT_FILE_LOCKED
#define SECBIT_EXEC_RESTRICT_FILE_LOCKED (issecure_mask(9))
#endif

/*
 * SECBIT_EXEC_DENY_INTERACTIVE was added in kernel 6.14, so fill in the
 * definition if the securebits header doesn't provide it.
 */
#ifndef SECBIT_EXEC_DENY_INTERACTIVE
#define SECBIT_EXEC_DENY_INTERACTIVE (issecure_mask(10))
#endif

#ifndef SECBIT_EXEC_DENY_INTERACTIVE_LOCKED
#define SECBIT_EXEC_DENY_INTERACTIVE_LOCKED (issecure_mask(11))
#endif

/*
 * Assert the value of SECURE_ALL_BITS at compile-time to detect a change in
 * the set of secure bits coming from the kernel headers. The assertion is
 * only compiled when __ANDROID__ is defined.
 * Kernel 6.14 introduced new secure bits that need to be removed when
 * running on older kernels. An older kernel can be detected when the
 * prctl(PR_SET_SECUREBITS, ...) fails with errno set to EPERM.
 * When this is detected, remove the new bits and try the prctl call again.
 */
#if defined(__ANDROID__)
_Static_assert(SECURE_ALL_BITS == 0x555, "SECURE_ALL_BITS == 0x555.");
#endif

/*
 * Masks of the secure bits (and of their lock bits) that were introduced in
 * kernel 6.14. lock_securebits() clears both masks before retrying a
 * prctl(PR_SET_SECUREBITS) call that failed with EPERM.
 */
#define SECURE_BITS_6_14                                                       \
  (SECBIT_EXEC_RESTRICT_FILE | SECBIT_EXEC_DENY_INTERACTIVE)
#define SECURE_LOCK_BITS_6_14                                                  \
  (SECBIT_EXEC_RESTRICT_FILE_LOCKED | SECBIT_EXEC_DENY_INTERACTIVE_LOCKED)

/*
 * Used by lookup_(user|group) functions: the scratch buffer is doubled on
 * ERANGE until its size would exceed these caps (1 MiB each).
 */
#define MAX_PWENT_SZ (1 << 20)
#define MAX_GRENT_SZ (1 << 20)

/*
 * secure_noroot_set_and_locked: Checks whether |mask| contains both
 * SECBIT_NOROOT and SECBIT_NOROOT_LOCKED.
 * Returns 1 if both bits are present, 0 otherwise.
 */
int secure_noroot_set_and_locked(uint64_t mask)
{
  const uint64_t noroot_bits = SECBIT_NOROOT | SECBIT_NOROOT_LOCKED;

  return (mask & noroot_bits) == noroot_bits;
}

/*
 * lock_securebits: Sets and locks the securebits of the calling process.
 *
 * Starts from SECURE_ALL_BITS | SECURE_ALL_LOCKS, drops the bits described
 * in the comments below plus everything in |skip_mask|, and hands the result
 * to prctl(PR_SET_SECUREBITS).
 *
 * Returns 0 on success (including when no bit is left to set) and -1 when a
 * prctl() call fails.
 */
int lock_securebits(uint64_t skip_mask, bool require_keep_caps)
{
  /* Bits that only kernels >= 6.14 know about. */
  const unsigned long bits_6_14 = SECURE_BITS_6_14 | SECURE_LOCK_BITS_6_14;
  /* The general idea is to set all bits, subject to exceptions below. */
  unsigned long wanted = SECURE_ALL_BITS | SECURE_ALL_LOCKS;

  /*
   * SECBIT_KEEP_CAPS is automatically cleared on execve(2), so a process
   * that already has it locked (e.g. nested minijail usage) would fail when
   * trying to set it again. Unless the caller insists on it, leave the bit
   * off when it is currently locked in the cleared state.
   */
  if (!require_keep_caps) {
    int current = prctl(PR_GET_SECUREBITS);
    if (current < 0) {
      pwarn("prctl(PR_GET_SECUREBITS) failed");
      return -1;
    }

    bool keep_caps_locked = (current & SECBIT_KEEP_CAPS_LOCKED) != 0;
    bool keep_caps_set = (current & SECBIT_KEEP_CAPS) != 0;
    if (keep_caps_locked && !keep_caps_set)
      wanted &= ~SECBIT_KEEP_CAPS;
  }

  /*
   * Ambient capabilities can only be raised if they're already present
   * in the permitted *and* inheritable set, which we configure ourselves,
   * so there is no real need to lock NO_CAP_AMBIENT_RAISE.
   */
  wanted &=
      ~(SECBIT_NO_CAP_AMBIENT_RAISE | SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED);

  /* Leave alone whatever the user asked us not to touch. */
  wanted &= ~skip_mask;

  if (wanted == 0) {
    warn("not locking any securebits");
    return 0;
  }

  int rc = prctl(PR_SET_SECUREBITS, wanted);
  if (rc < 0 && errno == EPERM && (wanted & bits_6_14) != 0) {
    /* Possibly running on kernel < 6.14: retry without the new bits. */
    wanted &= ~bits_6_14;
    rc = prctl(PR_SET_SECUREBITS, wanted);
  }
  if (rc < 0) {
    pwarn("prctl(PR_SET_SECUREBITS) failed");
    return -1;
  }

  return 0;
}

/*
 * write_proc_file: Writes the string |content| to /proc/<pid>/<basename>.
 *
 * Returns 0 when the whole string was written, a negative errno value when
 * open(2) or write(2) fails, and -1 when the file name does not fit in the
 * local buffer or the write was short.
 */
int write_proc_file(pid_t pid, const char *content, const char *basename)
{
  attribute_cleanup_fd int fd = -1;
  int ret;
  size_t sz, len;
  ssize_t written;
  char filename[32];

  sz = sizeof(filename);
  ret = snprintf(filename, sz, "/proc/%d/%s", pid, basename);
  if (ret < 0 || (size_t)ret >= sz) {
    warn("failed to generate %s filename", basename);
    return -1;
  }

  fd = open(filename, O_WRONLY | O_CLOEXEC);
  if (fd < 0) {
    /* Save errno first: the logging call may overwrite it. */
    ret = errno;
    pwarn("failed to open '%s'", filename);
    return -ret;
  }

  len = strlen(content);
  written = write(fd, content, len);
  if (written < 0) {
    /* Save errno first: the logging call may overwrite it. */
    ret = errno;
    pwarn("failed to write '%s'", filename);
    return -ret;
  }

  if ((size_t)written < len) {
    warn("failed to write %zu bytes to '%s'", len, filename);
    return -1;
  }
  return 0;
}

/*
 * get_last_valid_cap: Returns the highest capability number the running
 * kernel supports.
 *
 * cap_valid() is deliberately avoided: it only reflects the kernel headers
 * we were *compiled* against, while the kernel we run on may know fewer
 * (older kernel) or more (newer kernel) capabilities.
 * Normally the answer is read from /proc. On Android, not all processes are
 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap', so the
 * value is probed with prctl(PR_CAPBSET_READ) instead.
 *
 * Dies via pdie() if the value cannot be read or is larger than 64.
 */
unsigned int get_last_valid_cap(void)
{
  unsigned int cap = 0;

  if (!is_android()) {
    static const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
    FILE *fp = fopen(cap_file, "re");
    if (!fp)
      pdie("fopen(%s)", cap_file);
    if (fscanf(fp, "%u", &cap) != 1)
      pdie("fscanf(%s)", cap_file);
    fclose(fp);
  } else {
    /* Walk upwards until the kernel rejects the capability number. */
    while (prctl(PR_CAPBSET_READ, cap, 0, 0, 0) >= 0)
      ++cap;

    /* |cap| is now the first failing value; step back to the last good. */
    if (cap > 0)
      cap--;
  }

  /* Caps are bitfields stored in 64-bit int. */
  if (cap > 64)
    pdie("unable to detect last valid cap: %u > 64", cap);
  return cap;
}

/*
 * cap_ambient_supported: Probes for ambient capability support by querying
 * CAP_CHOWN with prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET).
 * Returns 1 if the prctl() call does not fail, 0 otherwise.
 */
int cap_ambient_supported(void)
{
  int rc = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_CHOWN, 0, 0);

  return rc >= 0;
}

/*
 * config_net_loopback: Brings the "lo" interface up, i.e. the equivalent of
 * `ip link set up lo`.
 * Returns 0 on success and -1 if the socket or either ioctl fails.
 */
int config_net_loopback(void)
{
  const char ifname[] = "lo";

  /* Make sure people don't try to add really long names. */
  _Static_assert(sizeof(ifname) <= IFNAMSIZ, "interface name too long");

  struct ifreq ifr;
  attribute_cleanup_fd int sock =
      socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0);
  if (sock < 0) {
    pwarn("socket(AF_LOCAL) failed");
    return -1;
  }

  /*
   * Read the current flags first.  The kernel will assign IPv4 (127.0.0.1)
   * & IPv6 (::1) addresses automatically once the interface is up.
   * The copy includes the terminating NUL and is known to fit thanks to the
   * static assertion above.
   */
  memcpy(ifr.ifr_name, ifname, sizeof(ifname));
  if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0) {
    pwarn("ioctl(SIOCGIFFLAGS) failed");
    return -1;
  }

  /* The kernel preserves ifr.ifr_name, so only the flags need updating. */
  ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
  if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0) {
    pwarn("ioctl(SIOCSIFFLAGS) failed");
    return -1;
  }

  return 0;
}

/*
 * write_pid_to_path: Writes |pid| followed by a newline to the file |path|,
 * creating or truncating it.
 *
 * Returns 0 on success, a negative errno value when fopen(3) or fclose(3)
 * fails, and -1 when fprintf(3) fails.
 */
int write_pid_to_path(pid_t pid, const char *path)
{
  int rc;
  FILE *fp = fopen(path, "we");

  if (!fp) {
    /* Save errno first: the logging call may overwrite it. */
    rc = errno;
    pwarn("failed to open '%s'", path);
    return -rc;
  }
  if (fprintf(fp, "%d\n", (int)pid) < 0) {
    /* fprintf(3) does not set errno on failure. */
    warn("fprintf(%s) failed", path);
    fclose(fp);
    return -1;
  }
  if (fclose(fp)) {
    /* Save errno first: the logging call may overwrite it. */
    rc = errno;
    pwarn("fclose(%s) failed", path);
    return -rc;
  }

  return 0;
}

/*
 * Create the |path| directory and its parents (if need be) with |mode|.
 * If not |isdir|, then |path| is actually a file, so the last component
 * will not be created.
 *
 * Components that already exist are not an error.
 * Returns 0 on success or a negative errno value on failure.
 */
int mkdir_p(const char *path, mode_t mode, bool isdir)
{
  int rc;
  char *dir = strdup(path);
  if (!dir) {
    rc = errno;
    pwarn("strdup(%s) failed", path);
    return -rc;
  }

  /*
   * Starting from the root, work our way out to the end.  The scan begins
   * at the second character so that a leading '/' is not treated as a
   * component boundary.  An empty |path| has no second character, so skip
   * the scan instead of reading past the terminator.
   */
  char *p = dir[0] != '\0' ? strchr(dir + 1, '/') : NULL;
  while (p) {
    /* Temporarily cut the string here to create this prefix. */
    *p = '\0';
    if (mkdir(dir, mode) && errno != EEXIST) {
      rc = errno;
      pwarn("mkdir(%s, 0%o) failed", dir, mode);
      free(dir);
      return -rc;
    }
    *p = '/';
    p = strchr(p + 1, '/');
  }

  /*
   * Create the last directory.  We still check EEXIST here in case
   * of trailing slashes.
   */
  free(dir);
  if (isdir && mkdir(path, mode) && errno != EEXIST) {
    rc = errno;
    pwarn("mkdir(%s, 0%o) failed", path, mode);
    return -rc;
  }
  return 0;
}

/*
 * get_mount_flags: Obtain the mount flags of the mount where |source| lives.
 *
 * Stores the statvfs(3) f_flag value in |*mnt_flags|. When |mnt_flags| is
 * NULL nothing is looked up and 0 is returned.
 * Returns 0 on success or a negative errno value if statvfs(3) fails.
 */
int get_mount_flags(const char *source, unsigned long *mnt_flags)
{
  struct statvfs vfs;

  if (!mnt_flags)
    return 0;

  if (statvfs(source, &vfs)) {
    int saved_errno = errno;
    pwarn("failed to look up mount flags: source=%s", source);
    return -saved_errno;
  }

  *mnt_flags = vfs.f_flag;
  return 0;
}

/*
 * setup_mount_destination: Ensures the mount target exists.
 * Creates it if needed and possible.
 *
 * If |dest| already exists nothing is done. Otherwise |dest| is created as
 * a directory or as an empty file depending on |source| and |bind|, and is
 * then chown(2)ed to |uid|/|gid|.
 * Returns 0 on success or a negative errno value on failure.
 */
int setup_mount_destination(const char *source, const char *dest, uid_t uid,
          uid_t gid, bool bind)
{
  struct stat st;
  bool want_dir;
  int err;

  /* Nothing to do when the destination is already there. */
  if (stat(dest, &st) == 0)
    return 0;

  /*
   * Decide whether to make a directory or touch a file, based on the
   * source type.
   */
  if (source[0] != '/') {
    /*
     * The source is not an absolute path, so assume it names a pseudo
     * filesystem (e.g. "tmpfs") which wants a directory to be mounted on.
     * Such a source is not a real path and is not stat'ed.
     */

    /* Disallow relative bind mounts. */
    if (bind) {
      warn("relative bind-mounts are not allowed: source=%s",
           source);
      return -EINVAL;
    }

    want_dir = true;
  } else {
    /* The source is an absolute path -- it better exist! */
    if (stat(source, &st)) {
      err = errno;
      pwarn("stat(%s) failed", source);
      return -err;
    }

    /*
     * A directory source always gets a directory.  When bind mounting,
     * anything else is mounted as a file to support device nodes,
     * sockets, etc...
     *
     * For all other mounts, a block/char source is assumed to want a
     * directory to mount to.  Other source types (e.g. a fifo or socket)
     * get a file, which probably is not the right thing, but the later
     * mount() will fail anyway, so it shouldn't be a big deal.
     */
    if (S_ISDIR(st.st_mode))
      want_dir = true;
    else if (bind)
      want_dir = false;
    else
      want_dir = S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode);
  }

  /*
   * Now that we know what we want to do, do it!
   * The intermediate dirs and the final directory are created with 0755
   * perms.  This shouldn't be a problem because the actual mount will set
   * the perms/ownership on the mount point, which is all people should
   * need to access it.
   */
  err = mkdir_p(dest, 0755, want_dir);
  if (err)
    return err;

  if (!want_dir) {
    /* Touch the file; the descriptor is closed when it leaves scope. */
    attribute_cleanup_fd int fd =
        open(dest, O_RDWR | O_CREAT | O_CLOEXEC, 0700);
    if (fd < 0) {
      err = errno;
      pwarn("open(%s) failed", dest);
      return -err;
    }
  }

  if (chown(dest, uid, gid)) {
    err = errno;
    pwarn("chown(%s, %u, %u) failed", dest, uid, gid);
    return -err;
  }
  return 0;
}

/*
 * lookup_user: Gets the uid/gid for the given username.
 *
 * On success stores the user's uid in |*uid| and primary gid in |*gid| and
 * returns 0. Returns -ENOENT if the user does not exist, -ENOMEM if the
 * scratch buffer cannot be allocated, -ERANGE if even the largest buffer
 * tried is too small, or the negated error reported by getpwnam_r(3).
 */
int lookup_user(const char *user, uid_t *uid, gid_t *gid)
{
  char *buf = NULL;
  struct passwd pw;
  struct passwd *ppw = NULL;
  /*
   * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
   * a suggested starting size for the buffer, so let's try getting this
   * size first, and fallback to a default otherwise.
   * Any non-positive answer is treated as "no suggestion": a zero size
   * would never grow when doubled and the retry loop would not terminate.
   */
  ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
  if (sz <= 0)
    sz = 65536; /* your guess is as good as mine... */

  do {
    buf = malloc(sz);
    if (!buf)
      return -ENOMEM;
    int err = getpwnam_r(user, &pw, buf, sz, &ppw);
    /*
     * We're safe to free the buffer here. The strings inside |pw|
     * point inside |buf|, but we don't use any of them; this leaves
     * the pointers dangling but it's safe.
     * |ppw| points at |pw| if getpwnam_r(3) succeeded.
     */
    free(buf);
    if (err == ERANGE) {
      /* |buf| was too small, retry with a bigger one. */
      sz <<= 1;
    } else if (err != 0) {
      /* We got an error not related to the size of |buf|. */
      return -err;
    } else if (!ppw) {
      /* Not found. */
      return -ENOENT;
    } else {
      *uid = ppw->pw_uid;
      *gid = ppw->pw_gid;
      return 0;
    }
  } while (sz <= MAX_PWENT_SZ);

  /* A buffer of size MAX_PWENT_SZ is still too small, return an error. */
  return -ERANGE;
}

/*
 * lookup_group: Gets the gid for the given group name.
 *
 * On success stores the group's gid in |*gid| and returns 0. Returns
 * -ENOENT if the group does not exist, -ENOMEM if the scratch buffer cannot
 * be allocated, -ERANGE if even the largest buffer tried is too small, or
 * the negated error reported by getgrnam_r(3).
 */
int lookup_group(const char *group, gid_t *gid)
{
  char *buf = NULL;
  struct group gr;
  struct group *pgr = NULL;
  /*
   * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
   * a suggested starting size for the buffer, so let's try getting this
   * size first, and fallback to a default otherwise.
   * Any non-positive answer is treated as "no suggestion": a zero size
   * would never grow when doubled and the retry loop would not terminate.
   */
  ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
  if (sz <= 0)
    sz = 65536; /* and mine is as good as yours, really */

  do {
    buf = malloc(sz);
    if (!buf)
      return -ENOMEM;
    int err = getgrnam_r(group, &gr, buf, sz, &pgr);
    /*
     * We're safe to free the buffer here. The strings inside |gr|
     * point inside |buf|, but we don't use any of them; this leaves
     * the pointers dangling but it's safe.
     * |pgr| points at |gr| if getgrnam_r(3) succeeded.
     */
    free(buf);
    if (err == ERANGE) {
      /* |buf| was too small, retry with a bigger one. */
      sz <<= 1;
    } else if (err != 0) {
      /* We got an error not related to the size of |buf|. */
      return -err;
    } else if (!pgr) {
      /* Not found. */
      return -ENOENT;
    } else {
      *gid = pgr->gr_gid;
      return 0;
    }
  } while (sz <= MAX_GRENT_SZ);

  /* A buffer of size MAX_GRENT_SZ is still too small, return an error. */
  return -ERANGE;
}

/*
 * seccomp_action_is_available: Checks whether the seccomp action named
 * |wanted| is listed in /proc/sys/kernel/seccomp/actions_avail.
 *
 * Always reports false on Android (see below), and also reports false when
 * the file cannot be opened or read. An empty |wanted| never matches.
 */
static bool seccomp_action_is_available(const char *wanted)
{
  if (is_android()) {
    /*
     * Accessing |actions_avail| is generating SELinux denials, so
     * skip for now.
     * TODO(crbug.com/978022, jorgelo): Remove once the denial is
     * fixed.
     */
    return false;
  }
  static const char actions_avail_path[] =
      "/proc/sys/kernel/seccomp/actions_avail";
  attribute_cleanup_fp FILE *f = fopen(actions_avail_path, "re");

  if (!f) {
    pwarn("fopen(%s) failed", actions_avail_path);
    return false;
  }

  attribute_cleanup_str char *actions_avail = NULL;
  size_t buf_size = 0;
  if (getline(&actions_avail, &buf_size, f) < 0) {
    pwarn("getline() failed");
    return false;
  }

  /*
   * Compare |wanted| against each whitespace-separated entry of the line
   * as a whole, so that a name which is merely a substring of another
   * action (e.g. "action" inside "longaction") does not count as a match.
   * The trailing newline kept by getline(3) is treated as a separator.
   */
  static const char separators[] = " \t\n";
  const size_t wanted_len = strlen(wanted);
  const char *token = actions_avail + strspn(actions_avail, separators);
  while (*token != '\0') {
    size_t token_len = strcspn(token, separators);
    if (token_len == wanted_len &&
        strncmp(token, wanted, token_len) == 0)
      return true;
    /* Skip this entry and the separators that follow it. */
    token += token_len;
    token += strspn(token, separators);
  }
  return false;
}

/*
 * seccomp_ret_log_available: Reports whether the "log" seccomp action is
 * available. The answer is looked up on the first call and cached in a
 * function-local static for later calls.
 */
int seccomp_ret_log_available(void)
{
  /* -1 means "not probed yet". */
  static int cached = -1;

  if (cached != -1)
    return cached;

  cached = seccomp_action_is_available("log");
  return cached;
}

/*
 * seccomp_ret_kill_process_available: Reports whether the "kill_process"
 * seccomp action is available. The answer is looked up on the first call
 * and cached in a function-local static for later calls.
 */
int seccomp_ret_kill_process_available(void)
{
  /* -1 means "not probed yet". */
  static int cached = -1;

  if (cached != -1)
    return cached;

  cached = seccomp_action_is_available("kill_process");
  return cached;
}

/*
 * sys_set_no_new_privs: Sets the no_new_privs bit on the calling process.
 * Returns true on success; logs a warning and returns false if the prctl()
 * call fails.
 */
bool sys_set_no_new_privs(void)
{
  /*
   * See </kernel/seccomp.c> and </kernel/sys.c> in the kernel source tree
   * for an explanation of the parameters.
   */
  if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
    pwarn("prctl(PR_SET_NO_NEW_PRIVS) failed");
    return false;
  }

  return true;
}

/*
 * seccomp_filter_flags_available: Probes whether |flags| are accepted by
 * the seccomp syscall for SECCOMP_SET_MODE_FILTER by issuing the call with
 * a NULL filter. The flags count as available unless the call fails with
 * errno set to EINVAL.
 */
bool seccomp_filter_flags_available(unsigned int flags)
{
  if (sys_seccomp(SECCOMP_SET_MODE_FILTER, flags, NULL) != -1)
    return true;

  return errno != EINVAL;
}

bool is_canonical_path(const char *path)
{
  attribute_cleanup_str char *rp = realpath(path, NULL);
  if (!rp) {
    pwarn("realpath(%s) failed", path);
    return false;
  }

  if (streq(path, rp)) {
    return true;
  }

  size_t path_len = strlen(path);
  size_t rp_len = strlen(rp);
  /* If |path| has a single trailing slash, that's OK. */
  return path_len == rp_len + 1 && strncmp(path, rp, rp_len) == 0 &&
         path[path_len - 1] == '/';
}

Coverage Report

Created: 2025-12-31 06:16

Line	Count	Source
1		/* Copyright 2017 The ChromiumOS Authors
2		* Use of this source code is governed by a BSD-style license that can be
3		* found in the LICENSE file.
4		*/
5
6		#include "system.h"
7
8		#include <errno.h>
9		#include <fcntl.h>
10		#include <grp.h>
11		#include <net/if.h>
12		#include <pwd.h>
13		#include <stdbool.h>
14		#include <stdio.h>
15		#include <string.h>
16		#include <sys/ioctl.h>
17		#include <sys/prctl.h>
18		#include <sys/socket.h>
19		#include <sys/stat.h>
20		#include <sys/statvfs.h>
21		#include <unistd.h>
22
23		#include <linux/securebits.h>
24
25		#include "syscall_wrapper.h"
26		#include "util.h"
27
28		/*
29		* SECBIT_NO_CAP_AMBIENT_RAISE was added in kernel 4.3, so fill in the
30		* definition if the securebits header doesn't provide it.
31		*/
32		#ifndef SECBIT_NO_CAP_AMBIENT_RAISE
33		#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(6))
34		#endif
35
36		#ifndef SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED
37		#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED (issecure_mask(7))
38		#endif
39
40		/*
41		* SECBIT_EXEC_RESTRICT_FILE was added in kernel 6.14, so fill in the
42		* definition if the securebits header doesn't provide it.
43		*/
44		#ifndef SECBIT_EXEC_RESTRICT_FILE
45	0	#define SECBIT_EXEC_RESTRICT_FILE (issecure_mask(8))
46		#endif
47
48		#ifndef SECBIT_EXEC_RESTRICT_FILE_LOCKED
49	0	#define SECBIT_EXEC_RESTRICT_FILE_LOCKED (issecure_mask(9))
50		#endif
51
52		/*
53		* SECBIT_EXEC_DENY_INTERACTIVE was added in kernel 6.14, so fill in the
54		* definition if the securebits header doesn't provide it.
55		*/
56		#ifndef SECBIT_EXEC_DENY_INTERACTIVE
57	0	#define SECBIT_EXEC_DENY_INTERACTIVE (issecure_mask(10))
58		#endif
59
60		#ifndef SECBIT_EXEC_DENY_INTERACTIVE_LOCKED
61	0	#define SECBIT_EXEC_DENY_INTERACTIVE_LOCKED (issecure_mask(11))
62		#endif
63
64		/*
65		* Assert the value of SECURE_ALL_BITS at compile-time to detect a change in
66		* the set of secure bits coming from the kernel headers.
67		* Kernel 6.14 introduced new secure bits that need to be removed when
68		* running on older kernels. An older kernel can be detected when the
69		* prctl(PR_SET_SECUREBITS, ...) fails with errno set to EPERM.
70		* When this is detected, remove the new bits and try the prctl call again.
71		*/
72		#if defined(__ANDROID__)
73		_Static_assert(SECURE_ALL_BITS == 0x555, "SECURE_ALL_BITS == 0x555.");
74		#endif
75
76		#define SECURE_BITS_6_14 \
77	0	(SECBIT_EXEC_RESTRICT_FILE \| SECBIT_EXEC_DENY_INTERACTIVE)
78		#define SECURE_LOCK_BITS_6_14 \
79	0	(SECBIT_EXEC_RESTRICT_FILE_LOCKED \| SECBIT_EXEC_DENY_INTERACTIVE_LOCKED)
80
81		/* Used by lookup_(user\|group) functions. */
82	0	#define MAX_PWENT_SZ (1 << 20)
83	0	#define MAX_GRENT_SZ (1 << 20)
84
85		int secure_noroot_set_and_locked(uint64_t mask)
86	0	{
87	0	return (mask & (SECBIT_NOROOT \| SECBIT_NOROOT_LOCKED)) ==
88	0	(SECBIT_NOROOT \| SECBIT_NOROOT_LOCKED);
89	0	}
90
91		int lock_securebits(uint64_t skip_mask, bool require_keep_caps)
92	0	{
93		/* The general idea is to set all bits, subject to exceptions below. */
94	0	unsigned long securebits = SECURE_ALL_BITS \| SECURE_ALL_LOCKS;
95
96		/*
97		* SECBIT_KEEP_CAPS is special in that it is automatically cleared on
98		* execve(2). This implies that attempts to set SECBIT_KEEP_CAPS (as is
99		* the default) in processes that have it locked already (such as nested
100		* minijail usage) would fail. Thus, unless the caller requires it,
101		* allow it to remain off if it is already locked.
102		*/
103	0	if (!require_keep_caps) {
104	0	int current_securebits = prctl(PR_GET_SECUREBITS);
105	0	if (current_securebits < 0) {
106	0	pwarn("prctl(PR_GET_SECUREBITS) failed");
107	0	return -1;
108	0	}
109
110	0	if ((current_securebits & SECBIT_KEEP_CAPS_LOCKED) != 0 &&
111	0	(current_securebits & SECBIT_KEEP_CAPS) == 0) {
112	0	securebits &= ~SECBIT_KEEP_CAPS;
113	0	}
114	0	}
115
116		/*
117		* Ambient capabilities can only be raised if they're already present
118		* in the permitted and inheritable set. Therefore, we don't really
119		* need to lock the NO_CAP_AMBIENT_RAISE securebit, since we are already
120		* configuring the permitted and inheritable set.
121		*/
122	0	securebits &=
123	0	~(SECBIT_NO_CAP_AMBIENT_RAISE \| SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED);
124
125		/* Don't set any bits that the user requested not to be touched. */
126	0	securebits &= ~skip_mask;
127
128	0	if (!securebits) {
129	0	warn("not locking any securebits");
130	0	return 0;
131	0	}
132	0	int securebits_ret = prctl(PR_SET_SECUREBITS, securebits);
133	0	if (securebits_ret < 0) {
134	0	if (errno == EPERM &&
135	0	(securebits & (SECURE_BITS_6_14 \| SECURE_LOCK_BITS_6_14)) !=
136	0	0) {
137		/* Possibly running on kernel < 6.14. */
138	0	securebits &=
139	0	~(SECURE_BITS_6_14 \| SECURE_LOCK_BITS_6_14);
140	0	securebits_ret = prctl(PR_SET_SECUREBITS, securebits);
141	0	}
142	0	if (securebits_ret < 0) {
143	0	pwarn("prctl(PR_SET_SECUREBITS) failed");
144	0	return -1;
145	0	}
146	0	}
147
148	0	return 0;
149	0	}
150
151		int write_proc_file(pid_t pid, const char content, const char basename)
152	0	{
153	0	attribute_cleanup_fd int fd = -1;
154	0	int ret;
155	0	size_t sz, len;
156	0	ssize_t written;
157	0	char filename[32];
158
159	0	sz = sizeof(filename);
160	0	ret = snprintf(filename, sz, "/proc/%d/%s", pid, basename);
161	0	if (ret < 0 \|\| (size_t)ret >= sz) {
162	0	warn("failed to generate %s filename", basename);
163	0	return -1;
164	0	}
165
166	0	fd = open(filename, O_WRONLY \| O_CLOEXEC);
167	0	if (fd < 0) {
168	0	pwarn("failed to open '%s'", filename);
169	0	return -errno;
170	0	}
171
172	0	len = strlen(content);
173	0	written = write(fd, content, len);
174	0	if (written < 0) {
175	0	pwarn("failed to write '%s'", filename);
176	0	return -errno;
177	0	}
178
179	0	if ((size_t)written < len) {
180	0	warn("failed to write %zu bytes to '%s'", len, filename);
181	0	return -1;
182	0	}
183	0	return 0;
184	0	}
185
186		/*
187		* We specifically do not use cap_valid() as that only tells us the last
188		* valid cap we were compiled against (i.e. what the version of kernel
189		* headers says). If we run on a different kernel version, then it's not
190		* uncommon for that to be less (if an older kernel) or more (if a newer
191		* kernel).
192		* Normally, we suck up the answer via /proc. On Android, not all processes are
193		* guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we
194		* programmatically find the value by calling prctl(PR_CAPBSET_READ).
195		*/
196		unsigned int get_last_valid_cap(void)
197	0	{
198	0	unsigned int last_valid_cap = 0;
199	0	if (is_android()) {
200	0	for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0;
201	0	++last_valid_cap)
202	0	;
203
204		/* \|last_valid_cap\| will be the first failing value. */
205	0	if (last_valid_cap > 0) {
206	0	last_valid_cap--;
207	0	}
208	0	} else {
209	0	static const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
210	0	FILE *fp = fopen(cap_file, "re");
211	0	if (!fp)
212	0	pdie("fopen(%s)", cap_file);
213	0	if (fscanf(fp, "%u", &last_valid_cap) != 1)
214	0	pdie("fscanf(%s)", cap_file);
215	0	fclose(fp);
216	0	}
217		/* Caps are bitfields stored in 64-bit int. */
218	0	if (last_valid_cap > 64)
219	0	pdie("unable to detect last valid cap: %u > 64",
220	0	last_valid_cap);
221	0	return last_valid_cap;
222	0	}
223
224		int cap_ambient_supported(void)
225	0	{
226	0	return prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_CHOWN, 0, 0) >=
227	0	0;
228	0	}
229
230		int config_net_loopback(void)
231	0	{
232	0	const char ifname[] = "lo";
233	0	attribute_cleanup_fd int sock = -1;
234	0	struct ifreq ifr;
235
236		/* Make sure people don't try to add really long names. */
237	0	_Static_assert(sizeof(ifname) <= IFNAMSIZ, "interface name too long");
238
239	0	sock = socket(AF_LOCAL, SOCK_DGRAM \| SOCK_CLOEXEC, 0);
240	0	if (sock < 0) {
241	0	pwarn("socket(AF_LOCAL) failed");
242	0	return -1;
243	0	}
244
245		/*
246		* Do the equiv of `ip link set up lo`. The kernel will assign
247		* IPv4 (127.0.0.1) & IPv6 (::1) addresses automatically!
248		*/
249	0	strcpy(ifr.ifr_name, ifname);
250	0	if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0) {
251	0	pwarn("ioctl(SIOCGIFFLAGS) failed");
252	0	return -1;
253	0	}
254
255		/* The kernel preserves ifr.ifr_name for use. */
256	0	ifr.ifr_flags \|= IFF_UP \| IFF_RUNNING;
257	0	if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0) {
258	0	pwarn("ioctl(SIOCSIFFLAGS) failed");
259	0	return -1;
260	0	}
261
262	0	return 0;
263	0	}
264
265		int write_pid_to_path(pid_t pid, const char *path)
266	0	{
267	0	FILE *fp = fopen(path, "we");
268
269	0	if (!fp) {
270	0	pwarn("failed to open '%s'", path);
271	0	return -errno;
272	0	}
273	0	if (fprintf(fp, "%d\n", (int)pid) < 0) {
274		/* fprintf(3) does not set errno on failure. */
275	0	warn("fprintf(%s) failed", path);
276	0	fclose(fp);
277	0	return -1;
278	0	}
279	0	if (fclose(fp)) {
280	0	pwarn("fclose(%s) failed", path);
281	0	return -errno;
282	0	}
283
284	0	return 0;
285	0	}
286
287		/*
288		* Create the \|path\| directory and its parents (if need be) with \|mode\|.
289		* If not \|isdir\|, then \|path\| is actually a file, so the last component
290		* will not be created.
291		*/
292		int mkdir_p(const char *path, mode_t mode, bool isdir)
293	0	{
294	0	int rc;
295	0	char *dir = strdup(path);
296	0	if (!dir) {
297	0	rc = errno;
298	0	pwarn("strdup(%s) failed", path);
299	0	return -rc;
300	0	}
301
302		/* Starting from the root, work our way out to the end. */
303	0	char *p = strchr(dir + 1, '/');
304	0	while (p) {
305	0	*p = '\0';
306	0	if (mkdir(dir, mode) && errno != EEXIST) {
307	0	rc = errno;
308	0	pwarn("mkdir(%s, 0%o) failed", dir, mode);
309	0	free(dir);
310	0	return -rc;
311	0	}
312	0	*p = '/';
313	0	p = strchr(p + 1, '/');
314	0	}
315
316		/*
317		* Create the last directory. We still check EEXIST here in case
318		* of trailing slashes.
319		*/
320	0	free(dir);
321	0	if (isdir && mkdir(path, mode) && errno != EEXIST) {
322	0	rc = errno;
323	0	pwarn("mkdir(%s, 0%o) failed", path, mode);
324	0	return -rc;
325	0	}
326	0	return 0;
327	0	}
328
329		/*
330		* get_mount_flags: Obtain the mount flags of the mount where \|source\| lives.
331		*/
332		int get_mount_flags(const char source, unsigned long mnt_flags)
333	0	{
334	0	if (mnt_flags) {
335	0	struct statvfs stvfs_buf;
336	0	int rc = statvfs(source, &stvfs_buf);
337	0	if (rc) {
338	0	rc = errno;
339	0	pwarn("failed to look up mount flags: source=%s",
340	0	source);
341	0	return -rc;
342	0	}
343	0	*mnt_flags = stvfs_buf.f_flag;
344	0	}
345	0	return 0;
346	0	}
347
348		/*
349		* setup_mount_destination: Ensures the mount target exists.
350		* Creates it if needed and possible.
351		*/
352		int setup_mount_destination(const char source, const char dest, uid_t uid,
353		uid_t gid, bool bind)
354	0	{
355	0	int rc;
356	0	struct stat st_buf;
357	0	bool domkdir;
358
359	0	rc = stat(dest, &st_buf);
360	0	if (rc == 0) /* destination exists */
361	0	return 0;
362
363		/*
364		* Try to create the destination.
365		* Either make a directory or touch a file depending on the source type.
366		*
367		* If the source isn't an absolute path, assume it is a filesystem type
368		* such as "tmpfs" and create a directory to mount it on. The dest will
369		* be something like "none" or "proc" which we shouldn't be checking.
370		*/
371	0	if (source[0] == '/') {
372		/* The source is an absolute path -- it better exist! */
373	0	rc = stat(source, &st_buf);
374	0	if (rc) {
375	0	rc = errno;
376	0	pwarn("stat(%s) failed", source);
377	0	return -rc;
378	0	}
379
380		/*
381		* If bind mounting, we only create a directory if the source
382		* is a directory, else we always bind mount it as a file to
383		* support device nodes, sockets, etc...
384		*
385		* For all other mounts, we assume a block/char source is
386		* going to want a directory to mount to. If the source is
387		* something else (e.g. a fifo or socket), this probably will
388		* not do the right thing, but we'll fail later on when we try
389		* to mount(), so shouldn't be a big deal.
390		*/
391	0	domkdir = S_ISDIR(st_buf.st_mode) \|\|
392	0	(!bind && (S_ISBLK(st_buf.st_mode) \|\|
393	0	S_ISCHR(st_buf.st_mode)));
394	0	} else {
395		/* The source is a relative path -- assume it's a pseudo fs. */
396
397		/* Disallow relative bind mounts. */
398	0	if (bind) {
399	0	warn("relative bind-mounts are not allowed: source=%s",
400	0	source);
401	0	return -EINVAL;
402	0	}
403
404	0	domkdir = true;
405	0	}
406
407		/*
408		* Now that we know what we want to do, do it!
409		* We always create the intermediate dirs and the final path with 0755
410		* perms and root/root ownership. This shouldn't be a problem because
411		* the actual mount will set those perms/ownership on the mount point
412		* which is all people should need to access it.
413		*/
414	0	rc = mkdir_p(dest, 0755, domkdir);
415	0	if (rc)
416	0	return rc;
417	0	if (!domkdir) {
418	0	attribute_cleanup_fd int fd =
419	0	open(dest, O_RDWR \| O_CREAT \| O_CLOEXEC, 0700);
420	0	if (fd < 0) {
421	0	rc = errno;
422	0	pwarn("open(%s) failed", dest);
423	0	return -rc;
424	0	}
425	0	}
426	0	if (chown(dest, uid, gid)) {
427	0	rc = errno;
428	0	pwarn("chown(%s, %u, %u) failed", dest, uid, gid);
429	0	return -rc;
430	0	}
431	0	return 0;
432	0	}
433
434		/*
435		* lookup_user: Gets the uid/gid for the given username.
436		*/
437		int lookup_user(const char user, uid_t uid, gid_t *gid)
438	0	{
439	0	char *buf = NULL;
440	0	struct passwd pw;
441	0	struct passwd *ppw = NULL;
442		/*
443		* sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
444		* a suggested starting size for the buffer, so let's try getting this
445		* size first, and fallback to a default othersise.
446		*/
447	0	ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
448	0	if (sz == -1)
449	0	sz = 65536; /* your guess is as good as mine... */
450
451	0	do {
452	0	buf = malloc(sz);
453	0	if (!buf)
454	0	return -ENOMEM;
455	0	int err = getpwnam_r(user, &pw, buf, sz, &ppw);
456		/*
457		* We're safe to free the buffer here. The strings inside \|pw\|
458		* point inside \|buf\|, but we don't use any of them; this leaves
459		* the pointers dangling but it's safe.
460		* \|ppw\| points at \|pw\| if getpwnam_r(3) succeeded.
461		*/
462	0	free(buf);
463	0	if (err == ERANGE) {
464		/* \|buf\| was too small, retry with a bigger one. */
465	0	sz <<= 1;
466	0	} else if (err != 0) {
467		/* We got an error not related to the size of \|buf\|. */
468	0	return -err;
469	0	} else if (!ppw) {
470		/* Not found. */
471	0	return -ENOENT;
472	0	} else {
473	0	*uid = ppw->pw_uid;
474	0	*gid = ppw->pw_gid;
475	0	return 0;
476	0	}
477	0	} while (sz <= MAX_PWENT_SZ);
478
479		/* A buffer of size MAX_PWENT_SZ is still too small, return an error. */
480	0	return -ERANGE;
481	0	}
482
483		/*
484		* lookup_group: Gets the gid for the given group name.
485		*/
486		int lookup_group(const char group, gid_t gid)
487	0	{
488	0	char *buf = NULL;
489	0	struct group gr;
490	0	struct group *pgr = NULL;
491		/*
492		* sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
493		* a suggested starting size for the buffer, so let's try getting this
494		* size first, and fallback to a default otherwise.
495		*/
496	0	ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
497	0	if (sz == -1)
498	0	sz = 65536; /* and mine is as good as yours, really */
499
500	0	do {
501	0	buf = malloc(sz);
502	0	if (!buf)
503	0	return -ENOMEM;
504	0	int err = getgrnam_r(group, &gr, buf, sz, &pgr);
505		/*
506		* We're safe to free the buffer here. The strings inside \|gr\|
507		* point inside \|buf\|, but we don't use any of them; this leaves
508		* the pointers dangling but it's safe.
509		* \|pgr\| points at \|gr\| if getgrnam_r(3) succeeded.
510		*/
511	0	free(buf);
512	0	if (err == ERANGE) {
513		/* \|buf\| was too small, retry with a bigger one. */
514	0	sz <<= 1;
515	0	} else if (err != 0) {
516		/* We got an error not related to the size of \|buf\|. */
517	0	return -err;
518	0	} else if (!pgr) {
519		/* Not found. */
520	0	return -ENOENT;
521	0	} else {
522	0	*gid = pgr->gr_gid;
523	0	return 0;
524	0	}
525	0	} while (sz <= MAX_GRENT_SZ);
526
527		/* A buffer of size MAX_GRENT_SZ is still too small, return an error. */
528	0	return -ERANGE;
529	0	}
530
531		static bool seccomp_action_is_available(const char *wanted)
532	0	{
533	0	if (is_android()) {
534		/*
535		* Accessing \|actions_avail\| is generating SELinux denials, so
536		* skip for now.
537		* TODO(crbug.com/978022, jorgelo): Remove once the denial is
538		* fixed.
539		*/
540	0	return false;
541	0	}
542	0	static const char actions_avail_path[] =
543	0	"/proc/sys/kernel/seccomp/actions_avail";
544	0	attribute_cleanup_fp FILE *f = fopen(actions_avail_path, "re");
545
546	0	if (!f) {
547	0	pwarn("fopen(%s) failed", actions_avail_path);
548	0	return false;
549	0	}
550
551	0	attribute_cleanup_str char *actions_avail = NULL;
552	0	size_t buf_size = 0;
553	0	if (getline(&actions_avail, &buf_size, f) < 0) {
554	0	pwarn("getline() failed");
555	0	return false;
556	0	}
557
558		/*
559		* This is just substring search, which means that partial matches will
560		* match too (e.g. "action" would match "longaction"). There are no
561		* seccomp actions which include other actions though, so we're good for
562		* now. Eventually we might want to split the string by spaces.
563		*/
564	0	return strstr(actions_avail, wanted) != NULL;
565	0	}
566
/*
 * seccomp_ret_log_available: Reports whether the "log" seccomp action is
 * available.
 *
 * The result of seccomp_action_is_available() is cached in a function-local
 * static, with -1 meaning "not queried yet", so the lookup runs on the first
 * call only.
 */
int seccomp_ret_log_available(void)
{
	static int cached = -1;

	if (cached != -1)
		return cached;

	cached = seccomp_action_is_available("log");
	return cached;
}
576
/*
 * seccomp_ret_kill_process_available: Reports whether the "kill_process"
 * seccomp action is available.
 *
 * The result of seccomp_action_is_available() is cached in a function-local
 * static, with -1 meaning "not queried yet", so the lookup runs on the first
 * call only.
 */
int seccomp_ret_kill_process_available(void)
{
	static int cached = -1;

	if (cached != -1)
		return cached;

	cached = seccomp_action_is_available("kill_process");
	return cached;
}
587
/*
 * sys_set_no_new_privs: Sets the no_new_privs bit on the calling thread.
 *
 * Returns true if prctl(2) succeeded; otherwise logs a warning and returns
 * false.
 */
bool sys_set_no_new_privs(void)
{
	/*
	 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
	 * in the kernel source tree for an explanation of the parameters.
	 */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
		pwarn("prctl(PR_SET_NO_NEW_PRIVS) failed");
		return false;
	}

	return true;
}
601
602		bool seccomp_filter_flags_available(unsigned int flags)
603	0	{
604	0	return sys_seccomp(SECCOMP_SET_MODE_FILTER, flags, NULL) != -1 \|\|
605	0	errno != EINVAL;
606	0	}
607
608		bool is_canonical_path(const char *path)
609	0	{
610	0	attribute_cleanup_str char *rp = realpath(path, NULL);
611	0	if (!rp) {
612	0	pwarn("realpath(%s) failed", path);
613	0	return false;
614	0	}
615
616	0	if (streq(path, rp)) {
617	0	return true;
618	0	}
619
620	0	size_t path_len = strlen(path);
621	0	size_t rp_len = strlen(rp);
622		/* If \|path\| has a single trailing slash, that's OK. */
623	0	return path_len == rp_len + 1 && strncmp(path, rp, rp_len) == 0 &&
624	0	path[path_len - 1] == '/';
625	0	}