/src/crosvm/third_party/minijail/system.c

Source (jump to first uncovered line)
/* Copyright 2017 The ChromiumOS Authors
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "system.h"

#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <net/if.h>
#include <pwd.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <unistd.h>

#include <linux/securebits.h>

#include "syscall_wrapper.h"
#include "util.h"

/*
 * SECBIT_NO_CAP_AMBIENT_RAISE was added in kernel 4.3, so fill in the
 * definition if the securebits header doesn't provide it.
 */
#ifndef SECBIT_NO_CAP_AMBIENT_RAISE
#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(6))
#endif

/* Same fallback for the companion _LOCKED bit. */
#ifndef SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED
#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED (issecure_mask(7))
#endif

/*
 * Assert the value of SECURE_ALL_BITS at compile-time.
 * Android devices are currently compiled against 4.4 kernel headers. Kernel 4.3
 * added a new securebit.
 * When a new securebit is added, the new SECURE_ALL_BITS mask will return EPERM
 * when used on older kernels. The compile-time assert will catch this situation
 * at compile time.
 */
#if defined(__ANDROID__)
_Static_assert(SECURE_ALL_BITS == 0x55, "SECURE_ALL_BITS == 0x55.");
#endif

/*
 * Used by lookup_(user|group) functions: upper bound (1 MiB) on how far the
 * scratch buffer is grown before the lookup gives up with -ERANGE.
 */
#define MAX_PWENT_SZ (1 << 20)
#define MAX_GRENT_SZ (1 << 20)

/*
 * secure_noroot_set_and_locked: Returns non-zero iff |mask| has both
 * SECBIT_NOROOT and SECBIT_NOROOT_LOCKED set.
 */
int secure_noroot_set_and_locked(uint64_t mask)
{
  const uint64_t wanted = SECBIT_NOROOT | SECBIT_NOROOT_LOCKED;

  return (mask & wanted) == wanted;
}

/*
 * lock_securebits: Sets and locks the process securebits.
 *
 * |skip_mask| lists bits that must be left untouched.  If
 * |require_keep_caps| is false, SECBIT_KEEP_CAPS is not requested when it is
 * currently cleared and locked.
 *
 * Returns 0 on success (including when nothing is left to set), -1 if either
 * prctl(2) call fails.
 */
int lock_securebits(uint64_t skip_mask, bool require_keep_caps)
{
  /* Begin with every bit and every lock, then carve out the exceptions. */
  unsigned long bits = SECURE_ALL_BITS | SECURE_ALL_LOCKS;

  /*
   * Ambient capabilities can only be raised if they're already present
   * in the permitted *and* inheritable set, and those sets are configured
   * separately, so the NO_CAP_AMBIENT_RAISE securebit is left alone.
   */
  bits &= ~(SECBIT_NO_CAP_AMBIENT_RAISE | SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED);

  /* Bits the caller asked us not to touch. */
  bits &= ~skip_mask;

  /*
   * SECBIT_KEEP_CAPS is automatically cleared on execve(2), so a process
   * that already has it locked (e.g. nested minijail usage) cannot set it
   * again.  Unless the caller insists on it, tolerate the bit staying off
   * when it is already locked.
   */
  if (!require_keep_caps) {
    int cur = prctl(PR_GET_SECUREBITS);
    if (cur < 0) {
      pwarn("prctl(PR_GET_SECUREBITS) failed");
      return -1;
    }

    bool keep_caps_locked = (cur & SECBIT_KEEP_CAPS_LOCKED) != 0;
    bool keep_caps_set = (cur & SECBIT_KEEP_CAPS) != 0;
    if (keep_caps_locked && !keep_caps_set)
      bits &= ~SECBIT_KEEP_CAPS;
  }

  if (bits == 0) {
    warn("not locking any securebits");
    return 0;
  }

  if (prctl(PR_SET_SECUREBITS, bits) < 0) {
    pwarn("prctl(PR_SET_SECUREBITS) failed");
    return -1;
  }

  return 0;
}

/*
 * write_proc_file: Writes the string |content| to /proc/<pid>/<basename>.
 *
 * Returns 0 on success, -errno if open(2) or write(2) fails, and -1 if the
 * file name does not fit in the local buffer or the write was short.
 */
int write_proc_file(pid_t pid, const char *content, const char *basename)
{
  attribute_cleanup_fd int fd = -1;
  int ret;
  size_t sz, len;
  ssize_t written;
  char filename[32];

  sz = sizeof(filename);
  ret = snprintf(filename, sz, "/proc/%d/%s", pid, basename);
  if (ret < 0 || (size_t)ret >= sz) {
    /* Encoding error or truncated path. */
    warn("failed to generate %s filename", basename);
    return -1;
  }

  fd = open(filename, O_WRONLY | O_CLOEXEC);
  if (fd < 0) {
    /*
     * Capture errno before logging: the logging call may itself
     * change errno, and the caller must see open(2)'s error.
     */
    ret = errno;
    pwarn("failed to open '%s'", filename);
    return -ret;
  }

  len = strlen(content);
  written = write(fd, content, len);
  if (written < 0) {
    /* Same as above: keep write(2)'s errno intact. */
    ret = errno;
    pwarn("failed to write '%s'", filename);
    return -ret;
  }

  /* A single write(2) is attempted; a short write is an error. */
  if ((size_t)written < len) {
    warn("failed to write %zu bytes to '%s'", len, filename);
    return -1;
  }
  return 0;
}

/*
 * get_last_valid_cap: Returns the highest capability number the running
 * kernel knows about.
 *
 * cap_valid() is deliberately avoided: it only reflects the kernel headers
 * we were *compiled* against, and the running kernel may well support fewer
 * (older kernel) or more (newer kernel) capabilities.
 *
 * Normally the answer is read from /proc.  On Android, not all processes are
 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap', so there
 * the value is found by probing with prctl(PR_CAPBSET_READ).
 *
 * Any failure to determine the value is reported through pdie().
 */
unsigned int get_last_valid_cap(void)
{
  unsigned int cap = 0;

  if (!is_android()) {
    static const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
    FILE *fp = fopen(cap_file, "re");
    if (!fp)
      pdie("fopen(%s)", cap_file);
    if (fscanf(fp, "%u", &cap) != 1)
      pdie("fscanf(%s)", cap_file);
    fclose(fp);
  } else {
    /* Probe upwards until the kernel rejects a capability number. */
    while (prctl(PR_CAPBSET_READ, cap, 0, 0, 0) >= 0)
      cap++;

    /* |cap| is now the first failing value; step back to the last good one. */
    if (cap > 0)
      cap--;
  }

  /*
   * Caps are bitfields stored in 64-bit int.
   * NOTE(review): a value of exactly 64 passes this check even though
   * bit 64 would not fit in a 64-bit mask -- confirm against callers.
   */
  if (cap > 64)
    pdie("unable to detect last valid cap: %u > 64", cap);
  return cap;
}

/*
 * cap_ambient_supported: Returns non-zero if querying the ambient set for
 * CAP_CHOWN via prctl(PR_CAP_AMBIENT) does not fail, 0 otherwise.
 */
int cap_ambient_supported(void)
{
  int rc = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_CHOWN, 0, 0);

  return rc >= 0;
}

int config_net_loopback(void)
{
  const char ifname[] = "lo";
  attribute_cleanup_fd int sock = -1;
  struct ifreq ifr;

  /* Make sure people don't try to add really long names. */
  _Static_assert(sizeof(ifname) <= IFNAMSIZ, "interface name too long");

  sock = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0);
  if (sock < 0) {
    pwarn("socket(AF_LOCAL) failed");
    return -1;
  }

  /*
   * Do the equiv of `ip link set up lo`.  The kernel will assign
   * IPv4 (127.0.0.1) & IPv6 (::1) addresses automatically!
   */
  strcpy(ifr.ifr_name, ifname);
  if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0) {
    pwarn("ioctl(SIOCGIFFLAGS) failed");
    return -1;
  }

  /* The kernel preserves ifr.ifr_name for use. */
  ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
  if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0) {
    pwarn("ioctl(SIOCSIFFLAGS) failed");
    return -1;
  }

  return 0;
}

/*
 * write_pid_to_path: Writes |pid| in decimal, followed by a newline, to the
 * file at |path| (opened with mode "we").
 *
 * Returns 0 on success, -errno if fopen(3) or fclose(3) fails, and -1 if
 * fprintf(3) fails.
 */
int write_pid_to_path(pid_t pid, const char *path)
{
  int rc;
  FILE *fp = fopen(path, "we");

  if (!fp) {
    /*
     * Capture errno before logging: the logging call may itself
     * change errno, and the caller must see fopen(3)'s error.
     */
    rc = errno;
    pwarn("failed to open '%s'", path);
    return -rc;
  }
  if (fprintf(fp, "%d\n", (int)pid) < 0) {
    /* fprintf(3) does not set errno on failure. */
    warn("fprintf(%s) failed", path);
    fclose(fp);
    return -1;
  }
  /* fclose(3) flushes buffered data, so its result matters here. */
  if (fclose(fp)) {
    rc = errno;
    pwarn("fclose(%s) failed", path);
    return -rc;
  }

  return 0;
}

/*
 * Create the |path| directory and its parents (if need be) with |mode|.
 * If not |isdir|, then |path| is actually a file, so the last component
 * will not be created.
 *
 * Components that already exist (EEXIST) are not treated as errors.
 * Returns 0 on success or a negative errno value on failure.
 */
int mkdir_p(const char *path, mode_t mode, bool isdir)
{
  int rc;
  char *dir = strdup(path);
  if (!dir) {
    rc = errno;
    pwarn("strdup(%s) failed", path);
    return -rc;
  }

  /*
   * Starting from the root, work our way out to the end.  The first
   * character is skipped so a leading '/' is not treated as a component
   * boundary.  An empty |path| has no first character to skip, so do not
   * search past its terminator.
   */
  char *p = dir[0] != '\0' ? strchr(dir + 1, '/') : NULL;
  while (p) {
    /* Temporarily cut the string at this separator. */
    *p = '\0';
    if (mkdir(dir, mode) && errno != EEXIST) {
      rc = errno;
      pwarn("mkdir(%s, 0%o) failed", dir, mode);
      free(dir);
      return -rc;
    }
    *p = '/';
    p = strchr(p + 1, '/');
  }

  /*
   * Create the last directory.  We still check EEXIST here in case
   * of trailing slashes.
   */
  free(dir);
  if (isdir && mkdir(path, mode) && errno != EEXIST) {
    rc = errno;
    pwarn("mkdir(%s, 0%o) failed", path, mode);
    return -rc;
  }
  return 0;
}

/*
 * get_mount_flags: Obtain the mount flags of the mount where |source| lives.
 *
 * The flags reported by statvfs(3) are stored in |*mnt_flags|.  A NULL
 * |mnt_flags| makes this a no-op that returns 0.  Returns a negative errno
 * value if statvfs(3) fails.
 */
int get_mount_flags(const char *source, unsigned long *mnt_flags)
{
  struct statvfs vfs;

  if (!mnt_flags)
    return 0;

  if (statvfs(source, &vfs)) {
    int saved_errno = errno;
    pwarn("failed to look up mount flags: source=%s", source);
    return -saved_errno;
  }

  *mnt_flags = vfs.f_flag;
  return 0;
}

/*
 * setup_mount_destination: Ensures the mount target exists.
 * Creates it if needed and possible.
 *
 * If |dest| already exists, nothing is done.  Otherwise |dest| is created as
 * either a directory or an empty file (depending on |source| and |bind|) and
 * chown(2)ed to |uid|/|gid|.
 *
 * Returns 0 on success, a negative errno value on failure.
 */
int setup_mount_destination(const char *source, const char *dest, uid_t uid,
          uid_t gid, bool bind)
{
  struct stat sb;
  bool want_dir;
  int err;

  /* Destination already exists: nothing to set up. */
  if (stat(dest, &sb) == 0)
    return 0;

  /*
   * Decide whether to make a directory or touch a file, based on the
   * source type.
   */
  if (source[0] != '/') {
    /*
     * The source is not an absolute path -- assume it is a filesystem
     * type such as "tmpfs" (or something like "none"/"proc" which we
     * shouldn't be checking) and mount it on a directory.
     */

    /* Disallow relative bind mounts. */
    if (bind) {
      warn("relative bind-mounts are not allowed: source=%s",
           source);
      return -EINVAL;
    }

    want_dir = true;
  } else {
    /* The source is an absolute path -- it better exist! */
    if (stat(source, &sb)) {
      err = errno;
      pwarn("stat(%s) failed", source);
      return -err;
    }

    /*
     * A directory source always gets a directory.  For bind mounts,
     * anything else is bind mounted as a file to support device
     * nodes, sockets, etc...
     *
     * For all other mounts, a block/char source is assumed to want a
     * directory to mount to.  Other source types (e.g. a fifo or
     * socket) probably won't do the right thing, but mount() will
     * fail later on, so it shouldn't be a big deal.
     */
    if (S_ISDIR(sb.st_mode))
      want_dir = true;
    else if (bind)
      want_dir = false;
    else
      want_dir = S_ISBLK(sb.st_mode) || S_ISCHR(sb.st_mode);
  }

  /*
   * Now create it.  Intermediate dirs and the final path are always
   * created with 0755 perms and root/root ownership.  This shouldn't be a
   * problem because the actual mount will set those perms/ownership on
   * the mount point which is all people should need to access it.
   */
  err = mkdir_p(dest, 0755, want_dir);
  if (err)
    return err;

  if (!want_dir) {
    /* "touch" the destination file; the fd is only needed briefly. */
    attribute_cleanup_fd int fd =
        open(dest, O_RDWR | O_CREAT | O_CLOEXEC, 0700);
    if (fd < 0) {
      err = errno;
      pwarn("open(%s) failed", dest);
      return -err;
    }
  }

  if (chown(dest, uid, gid)) {
    err = errno;
    pwarn("chown(%s, %u, %u) failed", dest, uid, gid);
    return -err;
  }
  return 0;
}

/*
 * lookup_user: Gets the uid/gid for the given username.
 *
 * On success stores the ids in |*uid| and |*gid| and returns 0.  Returns
 * -ENOENT if the user is not found, -ENOMEM if the scratch buffer cannot be
 * allocated, -ERANGE if the buffer would have to grow past MAX_PWENT_SZ,
 * or the negated error reported by getpwnam_r(3).
 */
int lookup_user(const char *user, uid_t *uid, gid_t *gid)
{
  struct passwd pw;
  struct passwd *ppw = NULL;
  /*
   * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
   * a suggested starting size for the buffer, so try that size first and
   * fall back to a default otherwise.
   */
  ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
  if (sz == -1)
    sz = 65536; /* your guess is as good as mine... */

  for (;;) {
    char *scratch = malloc(sz);
    if (!scratch)
      return -ENOMEM;

    int err = getpwnam_r(user, &pw, scratch, sz, &ppw);
    /*
     * Freeing right away is fine: the strings inside |pw| point into
     * |scratch| and are left dangling, but only the numeric ids are
     * read below.  |ppw| points at |pw| if getpwnam_r(3) succeeded.
     */
    free(scratch);

    if (err != ERANGE) {
      /* An error not related to the size of the buffer. */
      if (err != 0)
        return -err;
      /* Not found. */
      if (!ppw)
        return -ENOENT;

      *uid = ppw->pw_uid;
      *gid = ppw->pw_gid;
      return 0;
    }

    /* The buffer was too small: double it, within the cap. */
    sz <<= 1;
    if (sz > MAX_PWENT_SZ) {
      /* A buffer of size MAX_PWENT_SZ is still too small. */
      return -ERANGE;
    }
  }
}

/*
 * lookup_group: Gets the gid for the given group name.
 *
 * On success stores the id in |*gid| and returns 0.  Returns -ENOENT if the
 * group is not found, -ENOMEM if the scratch buffer cannot be allocated,
 * -ERANGE if the buffer would have to grow past MAX_GRENT_SZ, or the negated
 * error reported by getgrnam_r(3).
 */
int lookup_group(const char *group, gid_t *gid)
{
  struct group gr;
  struct group *pgr = NULL;
  /*
   * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
   * a suggested starting size for the buffer, so try that size first and
   * fall back to a default otherwise.
   */
  ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
  if (sz == -1)
    sz = 65536; /* and mine is as good as yours, really */

  for (;;) {
    char *scratch = malloc(sz);
    if (!scratch)
      return -ENOMEM;

    int err = getgrnam_r(group, &gr, scratch, sz, &pgr);
    /*
     * Freeing right away is fine: the strings inside |gr| point into
     * |scratch| and are left dangling, but only the numeric id is
     * read below.  |pgr| points at |gr| if getgrnam_r(3) succeeded.
     */
    free(scratch);

    if (err != ERANGE) {
      /* An error not related to the size of the buffer. */
      if (err != 0)
        return -err;
      /* Not found. */
      if (!pgr)
        return -ENOENT;

      *gid = pgr->gr_gid;
      return 0;
    }

    /* The buffer was too small: double it, within the cap. */
    sz <<= 1;
    if (sz > MAX_GRENT_SZ) {
      /* A buffer of size MAX_GRENT_SZ is still too small. */
      return -ERANGE;
    }
  }
}

/*
 * seccomp_action_is_available: Reports whether the seccomp action named
 * |wanted| appears in the first line of
 * /proc/sys/kernel/seccomp/actions_avail.
 *
 * Returns false on Android (the file is not consulted there) and whenever
 * the file cannot be opened or read.
 */
static bool seccomp_action_is_available(const char *wanted)
{
  if (is_android()) {
    /*
     * Accessing |actions_avail| is generating SELinux denials, so
     * skip for now.
     * TODO(crbug.com/978022, jorgelo): Remove once the denial is
     * fixed.
     */
    return false;
  }
  const char actions_avail_path[] =
      "/proc/sys/kernel/seccomp/actions_avail";
  FILE *f = fopen(actions_avail_path, "re");

  if (!f) {
    pwarn("fopen(%s) failed", actions_avail_path);
    return false;
  }

  attribute_cleanup_str char *actions_avail = NULL;
  size_t buf_size = 0;
  if (getline(&actions_avail, &buf_size, f) < 0) {
    /* Log first so the message reflects getline()'s errno. */
    pwarn("getline() failed");
    fclose(f);
    return false;
  }
  /* The single line is all we need; release the stream. */
  fclose(f);

  /*
   * This is just substring search, which means that partial matches will
   * match too (e.g. "action" would match "longaction"). There are no
   * seccomp actions which include other actions though, so we're good for
   * now. Eventually we might want to split the string by spaces.
   */
  return strstr(actions_avail, wanted) != NULL;
}

/*
 * seccomp_ret_log_available: Returns non-zero if the "log" seccomp action is
 * reported as available.  The answer is computed on the first call and
 * remembered in a function-local static afterwards.
 */
int seccomp_ret_log_available(void)
{
  static int cached = -1;

  if (cached != -1)
    return cached;

  cached = seccomp_action_is_available("log");
  return cached;
}

/*
 * seccomp_ret_kill_process_available: Returns non-zero if the "kill_process"
 * seccomp action is reported as available.  The answer is computed on the
 * first call and remembered in a function-local static afterwards.
 */
int seccomp_ret_kill_process_available(void)
{
  static int cached = -1;

  if (cached != -1)
    return cached;

  cached = seccomp_action_is_available("kill_process");
  return cached;
}

/*
 * sys_set_no_new_privs: Sets the no_new_privs bit on the calling process.
 *
 * Returns true on success; logs a warning and returns false if prctl(2)
 * fails.
 */
bool sys_set_no_new_privs(void)
{
  /*
   * See </kernel/seccomp.c> and </kernel/sys.c> in the kernel source
   * tree for an explanation of the parameters.
   */
  if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
    pwarn("prctl(PR_SET_NO_NEW_PRIVS) failed");
    return false;
  }

  return true;
}

/*
 * seccomp_filter_flags_available: Probes whether |flags| are accepted for
 * SECCOMP_SET_MODE_FILTER by issuing the call with a NULL filter.
 *
 * The flags count as available unless the call fails with EINVAL; any other
 * outcome (including a different error) yields true.
 */
bool seccomp_filter_flags_available(unsigned int flags)
{
  if (sys_seccomp(SECCOMP_SET_MODE_FILTER, flags, NULL) != -1)
    return true;

  return errno != EINVAL;
}

bool is_canonical_path(const char *path)
{
  attribute_cleanup_str char *rp = realpath(path, NULL);
  if (!rp) {
    pwarn("realpath(%s) failed", path);
    return false;
  }

  if (streq(path, rp)) {
    return true;
  }

  size_t path_len = strlen(path);
  size_t rp_len = strlen(rp);
  /* If |path| has a single trailing slash, that's OK. */
  return path_len == rp_len + 1 && strncmp(path, rp, rp_len) == 0 &&
         path[path_len - 1] == '/';
}

Coverage Report

Created: 2024-09-08 06:35

Line	Count	Source (jump to first uncovered line)
1		/* Copyright 2017 The ChromiumOS Authors
2		* Use of this source code is governed by a BSD-style license that can be
3		* found in the LICENSE file.
4		*/
5
6		#include "system.h"
7
8		#include <errno.h>
9		#include <fcntl.h>
10		#include <grp.h>
11		#include <net/if.h>
12		#include <pwd.h>
13		#include <stdbool.h>
14		#include <stdio.h>
15		#include <string.h>
16		#include <sys/ioctl.h>
17		#include <sys/prctl.h>
18		#include <sys/socket.h>
19		#include <sys/stat.h>
20		#include <sys/statvfs.h>
21		#include <unistd.h>
22
23		#include <linux/securebits.h>
24
25		#include "syscall_wrapper.h"
26		#include "util.h"
27
28		/*
29		* SECBIT_NO_CAP_AMBIENT_RAISE was added in kernel 4.3, so fill in the
30		* definition if the securebits header doesn't provide it.
31		*/
32		#ifndef SECBIT_NO_CAP_AMBIENT_RAISE
33		#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(6))
34		#endif
35
36		#ifndef SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED
37		#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED (issecure_mask(7))
38		#endif
39
40		/*
41		* Assert the value of SECURE_ALL_BITS at compile-time.
42		* Android devices are currently compiled against 4.4 kernel headers. Kernel 4.3
43		* added a new securebit.
44		* When a new securebit is added, the new SECURE_ALL_BITS mask will return EPERM
45		* when used on older kernels. The compile-time assert will catch this situation
46		* at compile time.
47		*/
48		#if defined(__ANDROID__)
49		_Static_assert(SECURE_ALL_BITS == 0x55, "SECURE_ALL_BITS == 0x55.");
50		#endif
51
52		/* Used by lookup_(user\|group) functions. */
53	0	#define MAX_PWENT_SZ (1 << 20)
54	0	#define MAX_GRENT_SZ (1 << 20)
55
56		int secure_noroot_set_and_locked(uint64_t mask)
57	0	{
58	0	return (mask & (SECBIT_NOROOT \| SECBIT_NOROOT_LOCKED)) ==
59	0	(SECBIT_NOROOT \| SECBIT_NOROOT_LOCKED);
60	0	}
61
62		int lock_securebits(uint64_t skip_mask, bool require_keep_caps)
63	0	{
64		/* The general idea is to set all bits, subject to exceptions below. */
65	0	unsigned long securebits = SECURE_ALL_BITS \| SECURE_ALL_LOCKS;
66
67		/*
68		* SECBIT_KEEP_CAPS is special in that it is automatically cleared on
69		* execve(2). This implies that attempts to set SECBIT_KEEP_CAPS (as is
70		* the default) in processes that have it locked already (such as nested
71		* minijail usage) would fail. Thus, unless the caller requires it,
72		* allow it to remain off if it is already locked.
73		*/
74	0	if (!require_keep_caps) {
75	0	int current_securebits = prctl(PR_GET_SECUREBITS);
76	0	if (current_securebits < 0) {
77	0	pwarn("prctl(PR_GET_SECUREBITS) failed");
78	0	return -1;
79	0	}
80
81	0	if ((current_securebits & SECBIT_KEEP_CAPS_LOCKED) != 0 &&
82	0	(current_securebits & SECBIT_KEEP_CAPS) == 0) {
83	0	securebits &= ~SECBIT_KEEP_CAPS;
84	0	}
85	0	}
86
87		/*
88		* Ambient capabilities can only be raised if they're already present
89		* in the permitted and inheritable set. Therefore, we don't really
90		* need to lock the NO_CAP_AMBIENT_RAISE securebit, since we are already
91		* configuring the permitted and inheritable set.
92		*/
93	0	securebits &=
94	0	~(SECBIT_NO_CAP_AMBIENT_RAISE \| SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED);
95
96		/* Don't set any bits that the user requested not to be touched. */
97	0	securebits &= ~skip_mask;
98
99	0	if (!securebits) {
100	0	warn("not locking any securebits");
101	0	return 0;
102	0	}
103	0	int securebits_ret = prctl(PR_SET_SECUREBITS, securebits);
104	0	if (securebits_ret < 0) {
105	0	pwarn("prctl(PR_SET_SECUREBITS) failed");
106	0	return -1;
107	0	}
108
109	0	return 0;
110	0	}
111
112		int write_proc_file(pid_t pid, const char content, const char basename)
113	0	{
114	0	attribute_cleanup_fd int fd = -1;
115	0	int ret;
116	0	size_t sz, len;
117	0	ssize_t written;
118	0	char filename[32];
119
120	0	sz = sizeof(filename);
121	0	ret = snprintf(filename, sz, "/proc/%d/%s", pid, basename);
122	0	if (ret < 0 \|\| (size_t)ret >= sz) {
123	0	warn("failed to generate %s filename", basename);
124	0	return -1;
125	0	}
126
127	0	fd = open(filename, O_WRONLY \| O_CLOEXEC);
128	0	if (fd < 0) {
129	0	pwarn("failed to open '%s'", filename);
130	0	return -errno;
131	0	}
132
133	0	len = strlen(content);
134	0	written = write(fd, content, len);
135	0	if (written < 0) {
136	0	pwarn("failed to write '%s'", filename);
137	0	return -errno;
138	0	}
139
140	0	if ((size_t)written < len) {
141	0	warn("failed to write %zu bytes to '%s'", len, filename);
142	0	return -1;
143	0	}
144	0	return 0;
145	0	}
146
147		/*
148		* We specifically do not use cap_valid() as that only tells us the last
149		* valid cap we were compiled against (i.e. what the version of kernel
150		* headers says). If we run on a different kernel version, then it's not
151		* uncommon for that to be less (if an older kernel) or more (if a newer
152		* kernel).
153		* Normally, we suck up the answer via /proc. On Android, not all processes are
154		* guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we
155		* programmatically find the value by calling prctl(PR_CAPBSET_READ).
156		*/
157		unsigned int get_last_valid_cap(void)
158	0	{
159	0	unsigned int last_valid_cap = 0;
160	0	if (is_android()) {
161	0	for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0;
162	0	++last_valid_cap)
163	0	;
164
165		/* \|last_valid_cap\| will be the first failing value. */
166	0	if (last_valid_cap > 0) {
167	0	last_valid_cap--;
168	0	}
169	0	} else {
170	0	static const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
171	0	FILE *fp = fopen(cap_file, "re");
172	0	if (!fp)
173	0	pdie("fopen(%s)", cap_file);
174	0	if (fscanf(fp, "%u", &last_valid_cap) != 1)
175	0	pdie("fscanf(%s)", cap_file);
176	0	fclose(fp);
177	0	}
178		/* Caps are bitfields stored in 64-bit int. */
179	0	if (last_valid_cap > 64)
180	0	pdie("unable to detect last valid cap: %u > 64",
181	0	last_valid_cap);
182	0	return last_valid_cap;
183	0	}
184
185		int cap_ambient_supported(void)
186	0	{
187	0	return prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_CHOWN, 0, 0) >=
188	0	0;
189	0	}
190
191		int config_net_loopback(void)
192	0	{
193	0	const char ifname[] = "lo";
194	0	attribute_cleanup_fd int sock = -1;
195	0	struct ifreq ifr;
196
197		/* Make sure people don't try to add really long names. */
198	0	_Static_assert(sizeof(ifname) <= IFNAMSIZ, "interface name too long");
199
200	0	sock = socket(AF_LOCAL, SOCK_DGRAM \| SOCK_CLOEXEC, 0);
201	0	if (sock < 0) {
202	0	pwarn("socket(AF_LOCAL) failed");
203	0	return -1;
204	0	}
205
206		/*
207		* Do the equiv of `ip link set up lo`. The kernel will assign
208		* IPv4 (127.0.0.1) & IPv6 (::1) addresses automatically!
209		*/
210	0	strcpy(ifr.ifr_name, ifname);
211	0	if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0) {
212	0	pwarn("ioctl(SIOCGIFFLAGS) failed");
213	0	return -1;
214	0	}
215
216		/* The kernel preserves ifr.ifr_name for use. */
217	0	ifr.ifr_flags \|= IFF_UP \| IFF_RUNNING;
218	0	if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0) {
219	0	pwarn("ioctl(SIOCSIFFLAGS) failed");
220	0	return -1;
221	0	}
222
223	0	return 0;
224	0	}
225
226		int write_pid_to_path(pid_t pid, const char *path)
227	0	{
228	0	FILE *fp = fopen(path, "we");
229
230	0	if (!fp) {
231	0	pwarn("failed to open '%s'", path);
232	0	return -errno;
233	0	}
234	0	if (fprintf(fp, "%d\n", (int)pid) < 0) {
235		/* fprintf(3) does not set errno on failure. */
236	0	warn("fprintf(%s) failed", path);
237	0	fclose(fp);
238	0	return -1;
239	0	}
240	0	if (fclose(fp)) {
241	0	pwarn("fclose(%s) failed", path);
242	0	return -errno;
243	0	}
244
245	0	return 0;
246	0	}
247
248		/*
249		* Create the \|path\| directory and its parents (if need be) with \|mode\|.
250		* If not \|isdir\|, then \|path\| is actually a file, so the last component
251		* will not be created.
252		*/
253		int mkdir_p(const char *path, mode_t mode, bool isdir)
254	0	{
255	0	int rc;
256	0	char *dir = strdup(path);
257	0	if (!dir) {
258	0	rc = errno;
259	0	pwarn("strdup(%s) failed", path);
260	0	return -rc;
261	0	}
262
263		/* Starting from the root, work our way out to the end. */
264	0	char *p = strchr(dir + 1, '/');
265	0	while (p) {
266	0	*p = '\0';
267	0	if (mkdir(dir, mode) && errno != EEXIST) {
268	0	rc = errno;
269	0	pwarn("mkdir(%s, 0%o) failed", dir, mode);
270	0	free(dir);
271	0	return -rc;
272	0	}
273	0	*p = '/';
274	0	p = strchr(p + 1, '/');
275	0	}
276
277		/*
278		* Create the last directory. We still check EEXIST here in case
279		* of trailing slashes.
280		*/
281	0	free(dir);
282	0	if (isdir && mkdir(path, mode) && errno != EEXIST) {
283	0	rc = errno;
284	0	pwarn("mkdir(%s, 0%o) failed", path, mode);
285	0	return -rc;
286	0	}
287	0	return 0;
288	0	}
289
290		/*
291		* get_mount_flags: Obtain the mount flags of the mount where \|source\| lives.
292		*/
293		int get_mount_flags(const char source, unsigned long mnt_flags)
294	0	{
295	0	if (mnt_flags) {
296	0	struct statvfs stvfs_buf;
297	0	int rc = statvfs(source, &stvfs_buf);
298	0	if (rc) {
299	0	rc = errno;
300	0	pwarn("failed to look up mount flags: source=%s",
301	0	source);
302	0	return -rc;
303	0	}
304	0	*mnt_flags = stvfs_buf.f_flag;
305	0	}
306	0	return 0;
307	0	}
308
309		/*
310		* setup_mount_destination: Ensures the mount target exists.
311		* Creates it if needed and possible.
312		*/
313		int setup_mount_destination(const char source, const char dest, uid_t uid,
314		uid_t gid, bool bind)
315	0	{
316	0	int rc;
317	0	struct stat st_buf;
318	0	bool domkdir;
319
320	0	rc = stat(dest, &st_buf);
321	0	if (rc == 0) /* destination exists */
322	0	return 0;
323
324		/*
325		* Try to create the destination.
326		* Either make a directory or touch a file depending on the source type.
327		*
328		* If the source isn't an absolute path, assume it is a filesystem type
329		* such as "tmpfs" and create a directory to mount it on. The dest will
330		* be something like "none" or "proc" which we shouldn't be checking.
331		*/
332	0	if (source[0] == '/') {
333		/* The source is an absolute path -- it better exist! */
334	0	rc = stat(source, &st_buf);
335	0	if (rc) {
336	0	rc = errno;
337	0	pwarn("stat(%s) failed", source);
338	0	return -rc;
339	0	}
340
341		/*
342		* If bind mounting, we only create a directory if the source
343		* is a directory, else we always bind mount it as a file to
344		* support device nodes, sockets, etc...
345		*
346		* For all other mounts, we assume a block/char source is
347		* going to want a directory to mount to. If the source is
348		* something else (e.g. a fifo or socket), this probably will
349		* not do the right thing, but we'll fail later on when we try
350		* to mount(), so shouldn't be a big deal.
351		*/
352	0	domkdir = S_ISDIR(st_buf.st_mode) \|\|
353	0	(!bind && (S_ISBLK(st_buf.st_mode) \|\|
354	0	S_ISCHR(st_buf.st_mode)));
355	0	} else {
356		/* The source is a relative path -- assume it's a pseudo fs. */
357
358		/* Disallow relative bind mounts. */
359	0	if (bind) {
360	0	warn("relative bind-mounts are not allowed: source=%s",
361	0	source);
362	0	return -EINVAL;
363	0	}
364
365	0	domkdir = true;
366	0	}
367
368		/*
369		* Now that we know what we want to do, do it!
370		* We always create the intermediate dirs and the final path with 0755
371		* perms and root/root ownership. This shouldn't be a problem because
372		* the actual mount will set those perms/ownership on the mount point
373		* which is all people should need to access it.
374		*/
375	0	rc = mkdir_p(dest, 0755, domkdir);
376	0	if (rc)
377	0	return rc;
378	0	if (!domkdir) {
379	0	attribute_cleanup_fd int fd =
380	0	open(dest, O_RDWR \| O_CREAT \| O_CLOEXEC, 0700);
381	0	if (fd < 0) {
382	0	rc = errno;
383	0	pwarn("open(%s) failed", dest);
384	0	return -rc;
385	0	}
386	0	}
387	0	if (chown(dest, uid, gid)) {
388	0	rc = errno;
389	0	pwarn("chown(%s, %u, %u) failed", dest, uid, gid);
390	0	return -rc;
391	0	}
392	0	return 0;
393	0	}
394
395		/*
396		* lookup_user: Gets the uid/gid for the given username.
397		*/
398		int lookup_user(const char user, uid_t uid, gid_t *gid)
399	0	{
400	0	char *buf = NULL;
401	0	struct passwd pw;
402	0	struct passwd *ppw = NULL;
403		/*
404		* sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
405		* a suggested starting size for the buffer, so let's try getting this
406		* size first, and fallback to a default othersise.
407		*/
408	0	ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
409	0	if (sz == -1)
410	0	sz = 65536; /* your guess is as good as mine... */
411
412	0	do {
413	0	buf = malloc(sz);
414	0	if (!buf)
415	0	return -ENOMEM;
416	0	int err = getpwnam_r(user, &pw, buf, sz, &ppw);
417		/*
418		* We're safe to free the buffer here. The strings inside \|pw\|
419		* point inside \|buf\|, but we don't use any of them; this leaves
420		* the pointers dangling but it's safe.
421		* \|ppw\| points at \|pw\| if getpwnam_r(3) succeeded.
422		*/
423	0	free(buf);
424	0	if (err == ERANGE) {
425		/* \|buf\| was too small, retry with a bigger one. */
426	0	sz <<= 1;
427	0	} else if (err != 0) {
428		/* We got an error not related to the size of \|buf\|. */
429	0	return -err;
430	0	} else if (!ppw) {
431		/* Not found. */
432	0	return -ENOENT;
433	0	} else {
434	0	*uid = ppw->pw_uid;
435	0	*gid = ppw->pw_gid;
436	0	return 0;
437	0	}
438	0	} while (sz <= MAX_PWENT_SZ);
439
440		/* A buffer of size MAX_PWENT_SZ is still too small, return an error. */
441	0	return -ERANGE;
442	0	}
443
444		/*
445		* lookup_group: Gets the gid for the given group name.
446		*/
447		int lookup_group(const char group, gid_t gid)
448	0	{
449	0	char *buf = NULL;
450	0	struct group gr;
451	0	struct group *pgr = NULL;
452		/*
453		* sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
454		* a suggested starting size for the buffer, so let's try getting this
455		* size first, and fallback to a default otherwise.
456		*/
457	0	ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
458	0	if (sz == -1)
459	0	sz = 65536; /* and mine is as good as yours, really */
460
461	0	do {
462	0	buf = malloc(sz);
463	0	if (!buf)
464	0	return -ENOMEM;
465	0	int err = getgrnam_r(group, &gr, buf, sz, &pgr);
466		/*
467		* We're safe to free the buffer here. The strings inside \|gr\|
468		* point inside \|buf\|, but we don't use any of them; this leaves
469		* the pointers dangling but it's safe.
470		* \|pgr\| points at \|gr\| if getgrnam_r(3) succeeded.
471		*/
472	0	free(buf);
473	0	if (err == ERANGE) {
474		/* \|buf\| was too small, retry with a bigger one. */
475	0	sz <<= 1;
476	0	} else if (err != 0) {
477		/* We got an error not related to the size of \|buf\|. */
478	0	return -err;
479	0	} else if (!pgr) {
480		/* Not found. */
481	0	return -ENOENT;
482	0	} else {
483	0	*gid = pgr->gr_gid;
484	0	return 0;
485	0	}
486	0	} while (sz <= MAX_GRENT_SZ);
487
488		/* A buffer of size MAX_GRENT_SZ is still too small, return an error. */
489	0	return -ERANGE;
490	0	}
491
492		static bool seccomp_action_is_available(const char *wanted)
493	0	{
494	0	if (is_android()) {
495		/*
496		* Accessing \|actions_avail\| is generating SELinux denials, so
497		* skip for now.
498		* TODO(crbug.com/978022, jorgelo): Remove once the denial is
499		* fixed.
500		*/
501	0	return false;
502	0	}
503	0	const char actions_avail_path[] =
504	0	"/proc/sys/kernel/seccomp/actions_avail";
505	0	FILE *f = fopen(actions_avail_path, "re");
506
507	0	if (!f) {
508	0	pwarn("fopen(%s) failed", actions_avail_path);
509	0	return false;
510	0	}
511
512	0	attribute_cleanup_str char *actions_avail = NULL;
513	0	size_t buf_size = 0;
514	0	if (getline(&actions_avail, &buf_size, f) < 0) {
515	0	pwarn("getline() failed");
516	0	return false;
517	0	}
518
519		/*
520		* This is just substring search, which means that partial matches will
521		* match too (e.g. "action" would match "longaction"). There are no
522		* seccomp actions which include other actions though, so we're good for
523		* now. Eventually we might want to split the string by spaces.
524		*/
525	0	return strstr(actions_avail, wanted) != NULL;
526	0	}
527
528		int seccomp_ret_log_available(void)
529	0	{
530	0	static int ret_log_available = -1;
531
532	0	if (ret_log_available == -1)
533	0	ret_log_available = seccomp_action_is_available("log");
534
535	0	return ret_log_available;
536	0	}
537
538		int seccomp_ret_kill_process_available(void)
539	0	{
540	0	static int ret_kill_process_available = -1;
541
542	0	if (ret_kill_process_available == -1)
543	0	ret_kill_process_available =
544	0	seccomp_action_is_available("kill_process");
545
546	0	return ret_kill_process_available;
547	0	}
548
549		bool sys_set_no_new_privs(void)
550	0	{
551		/*
552		* Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
553		* in the kernel source tree for an explanation of the parameters.
554		*/
555	0	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0) {
556	0	return true;
557	0	} else {
558	0	pwarn("prctl(PR_SET_NO_NEW_PRIVS) failed");
559	0	return false;
560	0	}
561	0	}
562
563		bool seccomp_filter_flags_available(unsigned int flags)
564	0	{
565	0	return sys_seccomp(SECCOMP_SET_MODE_FILTER, flags, NULL) != -1 \|\|
566	0	errno != EINVAL;
567	0	}
568
569		bool is_canonical_path(const char *path)
570	0	{
571	0	attribute_cleanup_str char *rp = realpath(path, NULL);
572	0	if (!rp) {
573	0	pwarn("realpath(%s) failed", path);
574	0	return false;
575	0	}
576
577	0	if (streq(path, rp)) {
578	0	return true;
579	0	}
580
581	0	size_t path_len = strlen(path);
582	0	size_t rp_len = strlen(rp);
583		/* If \|path\| has a single trailing slash, that's OK. */
584	0	return path_len == rp_len + 1 && strncmp(path, rp, rp_len) == 0 &&
585	0	path[path_len - 1] == '/';
586	0	}