Coverage Report

Created: 2025-12-31 06:16

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/crosvm/third_party/minijail/system.c
Line
Count
Source
1
/* Copyright 2017 The ChromiumOS Authors
2
 * Use of this source code is governed by a BSD-style license that can be
3
 * found in the LICENSE file.
4
 */
5
6
#include "system.h"
7
8
#include <errno.h>
9
#include <fcntl.h>
10
#include <grp.h>
11
#include <net/if.h>
12
#include <pwd.h>
13
#include <stdbool.h>
14
#include <stdio.h>
15
#include <string.h>
16
#include <sys/ioctl.h>
17
#include <sys/prctl.h>
18
#include <sys/socket.h>
19
#include <sys/stat.h>
20
#include <sys/statvfs.h>
21
#include <unistd.h>
22
23
#include <linux/securebits.h>
24
25
#include "syscall_wrapper.h"
26
#include "util.h"
27
28
/*
29
 * SECBIT_NO_CAP_AMBIENT_RAISE was added in kernel 4.3, so fill in the
30
 * definition if the securebits header doesn't provide it.
31
 */
32
#ifndef SECBIT_NO_CAP_AMBIENT_RAISE
33
#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(6))
34
#endif
35
36
#ifndef SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED
37
#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED (issecure_mask(7))
38
#endif
39
40
/*
41
 * SECBIT_EXEC_RESTRICT_FILE was added in kernel 6.14, so fill in the
42
 * definition if the securebits header doesn't provide it.
43
 */
44
#ifndef SECBIT_EXEC_RESTRICT_FILE
45
0
#define SECBIT_EXEC_RESTRICT_FILE (issecure_mask(8))
46
#endif
47
48
#ifndef SECBIT_EXEC_RESTRICT_FILE_LOCKED
49
0
#define SECBIT_EXEC_RESTRICT_FILE_LOCKED (issecure_mask(9))
50
#endif
51
52
/*
53
 * SECBIT_EXEC_DENY_INTERACTIVE was added in kernel 6.14, so fill in the
54
 * definition if the securebits header doesn't provide it.
55
 */
56
#ifndef SECBIT_EXEC_DENY_INTERACTIVE
57
0
#define SECBIT_EXEC_DENY_INTERACTIVE (issecure_mask(10))
58
#endif
59
60
#ifndef SECBIT_EXEC_DENY_INTERACTIVE_LOCKED
61
0
#define SECBIT_EXEC_DENY_INTERACTIVE_LOCKED (issecure_mask(11))
62
#endif
63
64
/*
65
 * Assert the value of SECURE_ALL_BITS at compile-time to detect a change in
66
 * the set of secure bits coming from the kernel headers.
67
 * Kernel 6.14 introduced new secure bits that need to be removed when
68
 * running on older kernels. An older kernel can be detected when the
69
 * prctl(PR_SET_SECUREBITS, ...) fails with errno set to EPERM.
70
 * When this is detected, remove the new bits and try the prctl call again.
71
 */
72
#if defined(__ANDROID__)
73
_Static_assert(SECURE_ALL_BITS == 0x555, "SECURE_ALL_BITS == 0x555.");
74
#endif
75
76
#define SECURE_BITS_6_14                                                       \
77
0
  (SECBIT_EXEC_RESTRICT_FILE | SECBIT_EXEC_DENY_INTERACTIVE)
78
#define SECURE_LOCK_BITS_6_14                                                  \
79
0
  (SECBIT_EXEC_RESTRICT_FILE_LOCKED | SECBIT_EXEC_DENY_INTERACTIVE_LOCKED)
80
81
/* Used by lookup_(user|group) functions. */
82
0
#define MAX_PWENT_SZ (1 << 20)
83
0
#define MAX_GRENT_SZ (1 << 20)
84
85
/*
 * secure_noroot_set_and_locked: Returns nonzero iff |mask| contains both
 * SECBIT_NOROOT and SECBIT_NOROOT_LOCKED.
 */
int secure_noroot_set_and_locked(uint64_t mask)
{
  const uint64_t noroot_bits = SECBIT_NOROOT | SECBIT_NOROOT_LOCKED;

  return (mask & noroot_bits) == noroot_bits;
}
90
91
int lock_securebits(uint64_t skip_mask, bool require_keep_caps)
92
0
{
93
  /* The general idea is to set all bits, subject to exceptions below. */
94
0
  unsigned long securebits = SECURE_ALL_BITS | SECURE_ALL_LOCKS;
95
96
  /*
97
   * SECBIT_KEEP_CAPS is special in that it is automatically cleared on
98
   * execve(2). This implies that attempts to set SECBIT_KEEP_CAPS (as is
99
   * the default) in processes that have it locked already (such as nested
100
   * minijail usage) would fail. Thus, unless the caller requires it,
101
   * allow it to remain off if it is already locked.
102
   */
103
0
  if (!require_keep_caps) {
104
0
    int current_securebits = prctl(PR_GET_SECUREBITS);
105
0
    if (current_securebits < 0) {
106
0
      pwarn("prctl(PR_GET_SECUREBITS) failed");
107
0
      return -1;
108
0
    }
109
110
0
    if ((current_securebits & SECBIT_KEEP_CAPS_LOCKED) != 0 &&
111
0
        (current_securebits & SECBIT_KEEP_CAPS) == 0) {
112
0
      securebits &= ~SECBIT_KEEP_CAPS;
113
0
    }
114
0
  }
115
116
  /*
117
   * Ambient capabilities can only be raised if they're already present
118
   * in the permitted *and* inheritable set. Therefore, we don't really
119
   * need to lock the NO_CAP_AMBIENT_RAISE securebit, since we are already
120
   * configuring the permitted and inheritable set.
121
   */
122
0
  securebits &=
123
0
      ~(SECBIT_NO_CAP_AMBIENT_RAISE | SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED);
124
125
  /* Don't set any bits that the user requested not to be touched. */
126
0
  securebits &= ~skip_mask;
127
128
0
  if (!securebits) {
129
0
    warn("not locking any securebits");
130
0
    return 0;
131
0
  }
132
0
  int securebits_ret = prctl(PR_SET_SECUREBITS, securebits);
133
0
  if (securebits_ret < 0) {
134
0
    if (errno == EPERM &&
135
0
        (securebits & (SECURE_BITS_6_14 | SECURE_LOCK_BITS_6_14)) !=
136
0
      0) {
137
      /* Possibly running on kernel < 6.14. */
138
0
      securebits &=
139
0
          ~(SECURE_BITS_6_14 | SECURE_LOCK_BITS_6_14);
140
0
      securebits_ret = prctl(PR_SET_SECUREBITS, securebits);
141
0
    }
142
0
    if (securebits_ret < 0) {
143
0
      pwarn("prctl(PR_SET_SECUREBITS) failed");
144
0
      return -1;
145
0
    }
146
0
  }
147
148
0
  return 0;
149
0
}
150
151
int write_proc_file(pid_t pid, const char *content, const char *basename)
152
0
{
153
0
  attribute_cleanup_fd int fd = -1;
154
0
  int ret;
155
0
  size_t sz, len;
156
0
  ssize_t written;
157
0
  char filename[32];
158
159
0
  sz = sizeof(filename);
160
0
  ret = snprintf(filename, sz, "/proc/%d/%s", pid, basename);
161
0
  if (ret < 0 || (size_t)ret >= sz) {
162
0
    warn("failed to generate %s filename", basename);
163
0
    return -1;
164
0
  }
165
166
0
  fd = open(filename, O_WRONLY | O_CLOEXEC);
167
0
  if (fd < 0) {
168
0
    pwarn("failed to open '%s'", filename);
169
0
    return -errno;
170
0
  }
171
172
0
  len = strlen(content);
173
0
  written = write(fd, content, len);
174
0
  if (written < 0) {
175
0
    pwarn("failed to write '%s'", filename);
176
0
    return -errno;
177
0
  }
178
179
0
  if ((size_t)written < len) {
180
0
    warn("failed to write %zu bytes to '%s'", len, filename);
181
0
    return -1;
182
0
  }
183
0
  return 0;
184
0
}
185
186
/*
 * get_last_valid_cap: Returns the index of the highest capability supported
 * by the running kernel.
 *
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says). If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel).
 * Normally, we read the answer from /proc. On Android, not all processes are
 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we
 * programmatically find the value by calling prctl(PR_CAPBSET_READ).
 *
 * Dies (via pdie) if the value cannot be read or does not fit in a 64-bit
 * capability mask.
 */
unsigned int get_last_valid_cap(void)
{
  unsigned int last_valid_cap = 0;

  if (is_android()) {
    /* Probe upwards until the kernel rejects the capability number. */
    while (prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0)
      ++last_valid_cap;

    /* |last_valid_cap| is now the first failing value; step back. */
    if (last_valid_cap > 0)
      last_valid_cap--;
  } else {
    static const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
    FILE *fp = fopen(cap_file, "re");
    if (!fp)
      pdie("fopen(%s)", cap_file);
    if (fscanf(fp, "%u", &last_valid_cap) != 1)
      pdie("fscanf(%s)", cap_file);
    fclose(fp);
  }

  /*
   * Caps are bitfields stored in a 64-bit int, so the highest index that
   * can be represented is 63; 64 itself is already out of range.
   */
  if (last_valid_cap >= 64)
    pdie("unable to detect last valid cap: %u > 63", last_valid_cap);
  return last_valid_cap;
}
223
224
/*
 * cap_ambient_supported: Returns nonzero iff the
 * prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_CHOWN) query does not
 * fail, and 0 otherwise.
 */
int cap_ambient_supported(void)
{
  int rc = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_CHOWN, 0, 0);

  return rc >= 0;
}
229
230
int config_net_loopback(void)
231
0
{
232
0
  const char ifname[] = "lo";
233
0
  attribute_cleanup_fd int sock = -1;
234
0
  struct ifreq ifr;
235
236
  /* Make sure people don't try to add really long names. */
237
0
  _Static_assert(sizeof(ifname) <= IFNAMSIZ, "interface name too long");
238
239
0
  sock = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0);
240
0
  if (sock < 0) {
241
0
    pwarn("socket(AF_LOCAL) failed");
242
0
    return -1;
243
0
  }
244
245
  /*
246
   * Do the equiv of `ip link set up lo`.  The kernel will assign
247
   * IPv4 (127.0.0.1) & IPv6 (::1) addresses automatically!
248
   */
249
0
  strcpy(ifr.ifr_name, ifname);
250
0
  if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0) {
251
0
    pwarn("ioctl(SIOCGIFFLAGS) failed");
252
0
    return -1;
253
0
  }
254
255
  /* The kernel preserves ifr.ifr_name for use. */
256
0
  ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
257
0
  if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0) {
258
0
    pwarn("ioctl(SIOCSIFFLAGS) failed");
259
0
    return -1;
260
0
  }
261
262
0
  return 0;
263
0
}
264
265
/*
 * write_pid_to_path: Writes |pid| as a decimal number followed by a newline
 * to the file at |path|, opened with fopen(3) mode "we".
 *
 * Returns 0 on success, -errno if fopen(3) or fclose(3) fails, and -1 if
 * fprintf(3) fails.
 */
int write_pid_to_path(pid_t pid, const char *path)
{
  int saved_errno;
  FILE *fp = fopen(path, "we");

  if (!fp) {
    /* Capture errno before logging, which may overwrite it. */
    saved_errno = errno;
    pwarn("failed to open '%s'", path);
    return -saved_errno;
  }
  if (fprintf(fp, "%d\n", (int)pid) < 0) {
    /* fprintf(3) does not set errno on failure. */
    warn("fprintf(%s) failed", path);
    fclose(fp);
    return -1;
  }
  /* fclose(3) flushes buffered output, so its failure must be reported. */
  if (fclose(fp)) {
    saved_errno = errno;
    pwarn("fclose(%s) failed", path);
    return -saved_errno;
  }

  return 0;
}
286
287
/*
 * Create the |path| directory and its parents (if need be) with |mode|.
 * If not |isdir|, then |path| is actually a file, so the last component
 * will not be created.
 *
 * Components that already exist (EEXIST) are not treated as errors.
 * Returns 0 on success or a negative errno value on failure.
 */
int mkdir_p(const char *path, mode_t mode, bool isdir)
{
  int rc;
  char *dir = strdup(path);
  if (!dir) {
    rc = errno;
    pwarn("strdup(%s) failed", path);
    return -rc;
  }

  /*
   * Starting from the root, work our way out to the end. The search
   * begins at the second character so a leading '/' is not treated as a
   * component boundary. An empty |path| has no second character: looking
   * at dir + 1 would read past the end of the allocation, so skip the
   * walk entirely in that case.
   */
  char *p = dir[0] != '\0' ? strchr(dir + 1, '/') : NULL;
  while (p) {
    /* Temporarily terminate the string at this separator. */
    *p = '\0';
    if (mkdir(dir, mode) && errno != EEXIST) {
      rc = errno;
      pwarn("mkdir(%s, 0%o) failed", dir, mode);
      free(dir);
      return -rc;
    }
    *p = '/';
    p = strchr(p + 1, '/');
  }

  /*
   * Create the last directory.  We still check EEXIST here in case
   * of trailing slashes.
   */
  free(dir);
  if (isdir && mkdir(path, mode) && errno != EEXIST) {
    rc = errno;
    pwarn("mkdir(%s, 0%o) failed", path, mode);
    return -rc;
  }
  return 0;
}
328
329
/*
 * get_mount_flags: Obtain the mount flags of the mount where |source| lives.
 *
 * The flags reported by statvfs(3) are stored in |*mnt_flags|. A NULL
 * |mnt_flags| makes this a no-op that returns 0.
 * Returns 0 on success or a negative errno value if statvfs(3) fails.
 */
int get_mount_flags(const char *source, unsigned long *mnt_flags)
{
  struct statvfs vfs;

  if (!mnt_flags)
    return 0;

  if (statvfs(source, &vfs) != 0) {
    /* Keep errno safe from being overwritten while logging. */
    int saved_errno = errno;
    pwarn("failed to look up mount flags: source=%s",
          source);
    return -saved_errno;
  }

  *mnt_flags = vfs.f_flag;
  return 0;
}
347
348
/*
349
 * setup_mount_destination: Ensures the mount target exists.
350
 * Creates it if needed and possible.
351
 */
352
int setup_mount_destination(const char *source, const char *dest, uid_t uid,
353
          uid_t gid, bool bind)
354
0
{
355
0
  int rc;
356
0
  struct stat st_buf;
357
0
  bool domkdir;
358
359
0
  rc = stat(dest, &st_buf);
360
0
  if (rc == 0) /* destination exists */
361
0
    return 0;
362
363
  /*
364
   * Try to create the destination.
365
   * Either make a directory or touch a file depending on the source type.
366
   *
367
   * If the source isn't an absolute path, assume it is a filesystem type
368
   * such as "tmpfs" and create a directory to mount it on.  The dest will
369
   * be something like "none" or "proc" which we shouldn't be checking.
370
   */
371
0
  if (source[0] == '/') {
372
    /* The source is an absolute path -- it better exist! */
373
0
    rc = stat(source, &st_buf);
374
0
    if (rc) {
375
0
      rc = errno;
376
0
      pwarn("stat(%s) failed", source);
377
0
      return -rc;
378
0
    }
379
380
    /*
381
     * If bind mounting, we only create a directory if the source
382
     * is a directory, else we always bind mount it as a file to
383
     * support device nodes, sockets, etc...
384
     *
385
     * For all other mounts, we assume a block/char source is
386
     * going to want a directory to mount to.  If the source is
387
     * something else (e.g. a fifo or socket), this probably will
388
     * not do the right thing, but we'll fail later on when we try
389
     * to mount(), so shouldn't be a big deal.
390
     */
391
0
    domkdir = S_ISDIR(st_buf.st_mode) ||
392
0
        (!bind && (S_ISBLK(st_buf.st_mode) ||
393
0
             S_ISCHR(st_buf.st_mode)));
394
0
  } else {
395
    /* The source is a relative path -- assume it's a pseudo fs. */
396
397
    /* Disallow relative bind mounts. */
398
0
    if (bind) {
399
0
      warn("relative bind-mounts are not allowed: source=%s",
400
0
           source);
401
0
      return -EINVAL;
402
0
    }
403
404
0
    domkdir = true;
405
0
  }
406
407
  /*
408
   * Now that we know what we want to do, do it!
409
   * We always create the intermediate dirs and the final path with 0755
410
   * perms and root/root ownership.  This shouldn't be a problem because
411
   * the actual mount will set those perms/ownership on the mount point
412
   * which is all people should need to access it.
413
   */
414
0
  rc = mkdir_p(dest, 0755, domkdir);
415
0
  if (rc)
416
0
    return rc;
417
0
  if (!domkdir) {
418
0
    attribute_cleanup_fd int fd =
419
0
        open(dest, O_RDWR | O_CREAT | O_CLOEXEC, 0700);
420
0
    if (fd < 0) {
421
0
      rc = errno;
422
0
      pwarn("open(%s) failed", dest);
423
0
      return -rc;
424
0
    }
425
0
  }
426
0
  if (chown(dest, uid, gid)) {
427
0
    rc = errno;
428
0
    pwarn("chown(%s, %u, %u) failed", dest, uid, gid);
429
0
    return -rc;
430
0
  }
431
0
  return 0;
432
0
}
433
434
/*
435
 * lookup_user: Gets the uid/gid for the given username.
436
 */
437
int lookup_user(const char *user, uid_t *uid, gid_t *gid)
438
0
{
439
0
  char *buf = NULL;
440
0
  struct passwd pw;
441
0
  struct passwd *ppw = NULL;
442
  /*
443
   * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
444
   * a suggested starting size for the buffer, so let's try getting this
445
   * size first, and fallback to a default othersise.
446
   */
447
0
  ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
448
0
  if (sz == -1)
449
0
    sz = 65536; /* your guess is as good as mine... */
450
451
0
  do {
452
0
    buf = malloc(sz);
453
0
    if (!buf)
454
0
      return -ENOMEM;
455
0
    int err = getpwnam_r(user, &pw, buf, sz, &ppw);
456
    /*
457
     * We're safe to free the buffer here. The strings inside |pw|
458
     * point inside |buf|, but we don't use any of them; this leaves
459
     * the pointers dangling but it's safe.
460
     * |ppw| points at |pw| if getpwnam_r(3) succeeded.
461
     */
462
0
    free(buf);
463
0
    if (err == ERANGE) {
464
      /* |buf| was too small, retry with a bigger one. */
465
0
      sz <<= 1;
466
0
    } else if (err != 0) {
467
      /* We got an error not related to the size of |buf|. */
468
0
      return -err;
469
0
    } else if (!ppw) {
470
      /* Not found. */
471
0
      return -ENOENT;
472
0
    } else {
473
0
      *uid = ppw->pw_uid;
474
0
      *gid = ppw->pw_gid;
475
0
      return 0;
476
0
    }
477
0
  } while (sz <= MAX_PWENT_SZ);
478
479
  /* A buffer of size MAX_PWENT_SZ is still too small, return an error. */
480
0
  return -ERANGE;
481
0
}
482
483
/*
484
 * lookup_group: Gets the gid for the given group name.
485
 */
486
int lookup_group(const char *group, gid_t *gid)
487
0
{
488
0
  char *buf = NULL;
489
0
  struct group gr;
490
0
  struct group *pgr = NULL;
491
  /*
492
   * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
493
   * a suggested starting size for the buffer, so let's try getting this
494
   * size first, and fallback to a default otherwise.
495
   */
496
0
  ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
497
0
  if (sz == -1)
498
0
    sz = 65536; /* and mine is as good as yours, really */
499
500
0
  do {
501
0
    buf = malloc(sz);
502
0
    if (!buf)
503
0
      return -ENOMEM;
504
0
    int err = getgrnam_r(group, &gr, buf, sz, &pgr);
505
    /*
506
     * We're safe to free the buffer here. The strings inside |gr|
507
     * point inside |buf|, but we don't use any of them; this leaves
508
     * the pointers dangling but it's safe.
509
     * |pgr| points at |gr| if getgrnam_r(3) succeeded.
510
     */
511
0
    free(buf);
512
0
    if (err == ERANGE) {
513
      /* |buf| was too small, retry with a bigger one. */
514
0
      sz <<= 1;
515
0
    } else if (err != 0) {
516
      /* We got an error not related to the size of |buf|. */
517
0
      return -err;
518
0
    } else if (!pgr) {
519
      /* Not found. */
520
0
      return -ENOENT;
521
0
    } else {
522
0
      *gid = pgr->gr_gid;
523
0
      return 0;
524
0
    }
525
0
  } while (sz <= MAX_GRENT_SZ);
526
527
  /* A buffer of size MAX_GRENT_SZ is still too small, return an error. */
528
0
  return -ERANGE;
529
0
}
530
531
static bool seccomp_action_is_available(const char *wanted)
532
0
{
533
0
  if (is_android()) {
534
    /*
535
     * Accessing |actions_avail| is generating SELinux denials, so
536
     * skip for now.
537
     * TODO(crbug.com/978022, jorgelo): Remove once the denial is
538
     * fixed.
539
     */
540
0
    return false;
541
0
  }
542
0
  static const char actions_avail_path[] =
543
0
      "/proc/sys/kernel/seccomp/actions_avail";
544
0
  attribute_cleanup_fp FILE *f = fopen(actions_avail_path, "re");
545
546
0
  if (!f) {
547
0
    pwarn("fopen(%s) failed", actions_avail_path);
548
0
    return false;
549
0
  }
550
551
0
  attribute_cleanup_str char *actions_avail = NULL;
552
0
  size_t buf_size = 0;
553
0
  if (getline(&actions_avail, &buf_size, f) < 0) {
554
0
    pwarn("getline() failed");
555
0
    return false;
556
0
  }
557
558
  /*
559
   * This is just substring search, which means that partial matches will
560
   * match too (e.g. "action" would match "longaction"). There are no
561
   * seccomp actions which include other actions though, so we're good for
562
   * now. Eventually we might want to split the string by spaces.
563
   */
564
0
  return strstr(actions_avail, wanted) != NULL;
565
0
}
566
567
/*
 * seccomp_ret_log_available: Returns nonzero iff the "log" seccomp action is
 * reported as available. The answer is computed on the first call and cached
 * in a function-local static for later calls.
 */
int seccomp_ret_log_available(void)
{
  static int cached = -1;

  if (cached != -1)
    return cached;

  cached = seccomp_action_is_available("log");
  return cached;
}
576
577
/*
 * seccomp_ret_kill_process_available: Returns nonzero iff the "kill_process"
 * seccomp action is reported as available. The answer is computed on the
 * first call and cached in a function-local static for later calls.
 */
int seccomp_ret_kill_process_available(void)
{
  static int cached = -1;

  if (cached != -1)
    return cached;

  cached = seccomp_action_is_available("kill_process");
  return cached;
}
587
588
/*
 * sys_set_no_new_privs: Sets the no_new_privs bit on the calling thread.
 *
 * See </kernel/seccomp.c> and </kernel/sys.c> in the kernel source tree for
 * an explanation of the prctl(2) parameters.
 * Returns true on success; logs a warning and returns false otherwise.
 */
bool sys_set_no_new_privs(void)
{
  if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
    pwarn("prctl(PR_SET_NO_NEW_PRIVS) failed");
    return false;
  }

  return true;
}
601
602
bool seccomp_filter_flags_available(unsigned int flags)
603
0
{
604
0
  return sys_seccomp(SECCOMP_SET_MODE_FILTER, flags, NULL) != -1 ||
605
0
         errno != EINVAL;
606
0
}
607
608
bool is_canonical_path(const char *path)
609
0
{
610
0
  attribute_cleanup_str char *rp = realpath(path, NULL);
611
0
  if (!rp) {
612
0
    pwarn("realpath(%s) failed", path);
613
0
    return false;
614
0
  }
615
616
0
  if (streq(path, rp)) {
617
0
    return true;
618
0
  }
619
620
0
  size_t path_len = strlen(path);
621
0
  size_t rp_len = strlen(rp);
622
  /* If |path| has a single trailing slash, that's OK. */
623
0
  return path_len == rp_len + 1 && strncmp(path, rp, rp_len) == 0 &&
624
0
         path[path_len - 1] == '/';
625
0
}