Coverage Report

Created: 2024-09-08 06:35

/src/crosvm/third_party/minijail/system.c
Line
Count
Source (jump to first uncovered line)
1
/* Copyright 2017 The ChromiumOS Authors
2
 * Use of this source code is governed by a BSD-style license that can be
3
 * found in the LICENSE file.
4
 */
5
6
#include "system.h"
7
8
#include <errno.h>
9
#include <fcntl.h>
10
#include <grp.h>
11
#include <net/if.h>
12
#include <pwd.h>
13
#include <stdbool.h>
14
#include <stdio.h>
15
#include <string.h>
16
#include <sys/ioctl.h>
17
#include <sys/prctl.h>
18
#include <sys/socket.h>
19
#include <sys/stat.h>
20
#include <sys/statvfs.h>
21
#include <unistd.h>
22
23
#include <linux/securebits.h>
24
25
#include "syscall_wrapper.h"
26
#include "util.h"
27
28
/*
29
 * SECBIT_NO_CAP_AMBIENT_RAISE was added in kernel 4.3, so fill in the
30
 * definition if the securebits header doesn't provide it.
31
 */
32
#ifndef SECBIT_NO_CAP_AMBIENT_RAISE
33
#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(6))
34
#endif
35
36
#ifndef SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED
37
#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED (issecure_mask(7))
38
#endif
39
40
/*
41
 * Assert the value of SECURE_ALL_BITS at compile-time.
42
 * Android devices are currently compiled against 4.4 kernel headers. Kernel 4.3
43
 * added a new securebit.
44
 * When a new securebit is added, the new SECURE_ALL_BITS mask will return EPERM
45
 * when used on older kernels. The compile-time assert will catch this situation
46
 * at compile time.
47
 */
48
#if defined(__ANDROID__)
49
_Static_assert(SECURE_ALL_BITS == 0x55, "SECURE_ALL_BITS == 0x55.");
50
#endif
51
52
/* Used by lookup_(user|group) functions. */
53
0
#define MAX_PWENT_SZ (1 << 20)
54
0
#define MAX_GRENT_SZ (1 << 20)
55
56
/*
 * secure_noroot_set_and_locked: Reports whether |mask| contains both
 * SECBIT_NOROOT and SECBIT_NOROOT_LOCKED.
 * Returns 1 when both bits are present and 0 otherwise.
 */
int secure_noroot_set_and_locked(uint64_t mask)
{
	const uint64_t wanted = SECBIT_NOROOT | SECBIT_NOROOT_LOCKED;

	return (mask & wanted) == wanted;
}
61
62
/*
 * lock_securebits: Sets and locks the process securebits through
 * prctl(PR_SET_SECUREBITS).
 *
 * Bits present in |skip_mask| are left untouched. When |require_keep_caps|
 * is false, SECBIT_KEEP_CAPS is not requested if it is currently locked in
 * the cleared state.
 *
 * Returns 0 on success (including when no bits remain to be set) and -1 if
 * a prctl(2) call fails.
 */
int lock_securebits(uint64_t skip_mask, bool require_keep_caps)
{
	/* Begin with every bit and every lock; exceptions are removed below. */
	unsigned long wanted = SECURE_ALL_BITS | SECURE_ALL_LOCKS;

	/*
	 * execve(2) clears SECBIT_KEEP_CAPS automatically, so a process that
	 * already has the bit locked (e.g. nested minijail usage) cannot set
	 * it again. Unless the caller insists on it, tolerate the bit staying
	 * off when it is locked off.
	 */
	if (!require_keep_caps) {
		int current = prctl(PR_GET_SECUREBITS);
		if (current < 0) {
			pwarn("prctl(PR_GET_SECUREBITS) failed");
			return -1;
		}

		bool keep_caps_locked =
		    (current & SECBIT_KEEP_CAPS_LOCKED) != 0;
		bool keep_caps_set = (current & SECBIT_KEEP_CAPS) != 0;
		if (keep_caps_locked && !keep_caps_set)
			wanted &= ~SECBIT_KEEP_CAPS;
	}

	/*
	 * Raising an ambient capability requires it to be in both the
	 * permitted and the inheritable sets, which are configured separately,
	 * so the NO_CAP_AMBIENT_RAISE securebit does not need to be locked.
	 */
	wanted &= ~(SECBIT_NO_CAP_AMBIENT_RAISE |
		    SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED);

	/* Honor the caller's request to leave some bits alone. */
	wanted &= ~skip_mask;

	if (wanted == 0) {
		warn("not locking any securebits");
		return 0;
	}

	if (prctl(PR_SET_SECUREBITS, wanted) < 0) {
		pwarn("prctl(PR_SET_SECUREBITS) failed");
		return -1;
	}

	return 0;
}
111
112
int write_proc_file(pid_t pid, const char *content, const char *basename)
113
0
{
114
0
  attribute_cleanup_fd int fd = -1;
115
0
  int ret;
116
0
  size_t sz, len;
117
0
  ssize_t written;
118
0
  char filename[32];
119
120
0
  sz = sizeof(filename);
121
0
  ret = snprintf(filename, sz, "/proc/%d/%s", pid, basename);
122
0
  if (ret < 0 || (size_t)ret >= sz) {
123
0
    warn("failed to generate %s filename", basename);
124
0
    return -1;
125
0
  }
126
127
0
  fd = open(filename, O_WRONLY | O_CLOEXEC);
128
0
  if (fd < 0) {
129
0
    pwarn("failed to open '%s'", filename);
130
0
    return -errno;
131
0
  }
132
133
0
  len = strlen(content);
134
0
  written = write(fd, content, len);
135
0
  if (written < 0) {
136
0
    pwarn("failed to write '%s'", filename);
137
0
    return -errno;
138
0
  }
139
140
0
  if ((size_t)written < len) {
141
0
    warn("failed to write %zu bytes to '%s'", len, filename);
142
0
    return -1;
143
0
  }
144
0
  return 0;
145
0
}
146
147
/*
 * get_last_valid_cap: Returns the index of the highest capability supported
 * by the running kernel.
 *
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says). If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel).
 * Normally, we read the answer from /proc. On Android, not all processes are
 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we
 * programmatically find the value by calling prctl(PR_CAPBSET_READ).
 *
 * Dies (via pdie) if the value cannot be read or does not fit a 64-bit
 * capability bitfield.
 */
unsigned int get_last_valid_cap(void)
{
	unsigned int last_valid_cap = 0;
	if (is_android()) {
		/* Probe upwards until the kernel rejects a capability. */
		for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0;
		     ++last_valid_cap)
			;

		/* |last_valid_cap| will be the first failing value. */
		if (last_valid_cap > 0) {
			last_valid_cap--;
		}
	} else {
		static const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
		FILE *fp = fopen(cap_file, "re");
		if (!fp) {
			pdie("fopen(%s)", cap_file);
		}
		if (fscanf(fp, "%u", &last_valid_cap) != 1) {
			pdie("fscanf(%s)", cap_file);
		}
		fclose(fp);
	}
	/*
	 * Caps are bitfields stored in 64-bit int, so the highest index that
	 * can be represented is 63.
	 */
	if (last_valid_cap > 63) {
		pdie("unable to detect last valid cap: %u > 63",
		     last_valid_cap);
	}
	return last_valid_cap;
}
184
185
/*
 * cap_ambient_supported: Probes for ambient capability support by querying
 * CAP_CHOWN with prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET).
 * Returns 1 if the prctl(2) call does not fail, 0 otherwise.
 */
int cap_ambient_supported(void)
{
	int rc = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_CHOWN, 0, 0);

	return rc >= 0;
}
190
191
int config_net_loopback(void)
192
0
{
193
0
  const char ifname[] = "lo";
194
0
  attribute_cleanup_fd int sock = -1;
195
0
  struct ifreq ifr;
196
197
  /* Make sure people don't try to add really long names. */
198
0
  _Static_assert(sizeof(ifname) <= IFNAMSIZ, "interface name too long");
199
200
0
  sock = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0);
201
0
  if (sock < 0) {
202
0
    pwarn("socket(AF_LOCAL) failed");
203
0
    return -1;
204
0
  }
205
206
  /*
207
   * Do the equiv of `ip link set up lo`.  The kernel will assign
208
   * IPv4 (127.0.0.1) & IPv6 (::1) addresses automatically!
209
   */
210
0
  strcpy(ifr.ifr_name, ifname);
211
0
  if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0) {
212
0
    pwarn("ioctl(SIOCGIFFLAGS) failed");
213
0
    return -1;
214
0
  }
215
216
  /* The kernel preserves ifr.ifr_name for use. */
217
0
  ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
218
0
  if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0) {
219
0
    pwarn("ioctl(SIOCSIFFLAGS) failed");
220
0
    return -1;
221
0
  }
222
223
0
  return 0;
224
0
}
225
226
/*
 * write_pid_to_path: Writes |pid| followed by a newline to the file at
 * |path|, opened with mode "we".
 *
 * Returns 0 on success, -errno if fopen(3) or fclose(3) fails, and -1 if
 * fprintf(3) fails.
 */
int write_pid_to_path(pid_t pid, const char *path)
{
	int rc;
	FILE *fp = fopen(path, "we");

	if (!fp) {
		/* Save errno first: logging may overwrite it. */
		rc = errno;
		pwarn("failed to open '%s'", path);
		return -rc;
	}
	if (fprintf(fp, "%d\n", (int)pid) < 0) {
		/* fprintf(3) does not set errno on failure. */
		warn("fprintf(%s) failed", path);
		fclose(fp);
		return -1;
	}
	/* fclose(3) flushes buffered output, so its result matters here. */
	if (fclose(fp)) {
		rc = errno;
		pwarn("fclose(%s) failed", path);
		return -rc;
	}

	return 0;
}
247
248
/*
 * mkdir_p: Create the |path| directory and its parents (if need be) with
 * |mode|.
 * If not |isdir|, then |path| is actually a file, so the last component
 * will not be created.
 *
 * Components that already exist (EEXIST) are not treated as errors.
 * Returns 0 on success or a negative errno value on failure.
 */
int mkdir_p(const char *path, mode_t mode, bool isdir)
{
	int rc;
	char *dir = strdup(path);
	if (!dir) {
		rc = errno;
		pwarn("strdup(%s) failed", path);
		return -rc;
	}

	/*
	 * Starting from the root, work our way out to the end. The first
	 * character is skipped so that a leading '/' is not taken as a
	 * component boundary. An empty |path| has no first character to skip,
	 * so it must not be scanned at all (that would read past the buffer).
	 */
	char *p = dir[0] != '\0' ? strchr(dir + 1, '/') : NULL;
	while (p) {
		/* Temporarily terminate the string at this separator. */
		*p = '\0';
		if (mkdir(dir, mode) && errno != EEXIST) {
			rc = errno;
			pwarn("mkdir(%s, 0%o) failed", dir, mode);
			free(dir);
			return -rc;
		}
		*p = '/';
		p = strchr(p + 1, '/');
	}

	/*
	 * Create the last directory.  We still check EEXIST here in case
	 * of trailing slashes.
	 */
	free(dir);
	if (isdir && mkdir(path, mode) && errno != EEXIST) {
		rc = errno;
		pwarn("mkdir(%s, 0%o) failed", path, mode);
		return -rc;
	}
	return 0;
}
289
290
/*
 * get_mount_flags: Looks up the mount flags of the filesystem containing
 * |source| using statvfs(3) and stores them in |*mnt_flags|.
 *
 * A NULL |mnt_flags| makes this a no-op that returns 0.
 * Returns 0 on success or a negative errno value if statvfs(3) fails.
 */
int get_mount_flags(const char *source, unsigned long *mnt_flags)
{
	struct statvfs vfs;

	/* Nothing to report to; skip the lookup entirely. */
	if (!mnt_flags)
		return 0;

	if (statvfs(source, &vfs) != 0) {
		int saved_errno = errno;
		pwarn("failed to look up mount flags: source=%s", source);
		return -saved_errno;
	}

	*mnt_flags = vfs.f_flag;
	return 0;
}
308
309
/*
310
 * setup_mount_destination: Ensures the mount target exists.
311
 * Creates it if needed and possible.
312
 */
313
int setup_mount_destination(const char *source, const char *dest, uid_t uid,
314
          uid_t gid, bool bind)
315
0
{
316
0
  int rc;
317
0
  struct stat st_buf;
318
0
  bool domkdir;
319
320
0
  rc = stat(dest, &st_buf);
321
0
  if (rc == 0) /* destination exists */
322
0
    return 0;
323
324
  /*
325
   * Try to create the destination.
326
   * Either make a directory or touch a file depending on the source type.
327
   *
328
   * If the source isn't an absolute path, assume it is a filesystem type
329
   * such as "tmpfs" and create a directory to mount it on.  The dest will
330
   * be something like "none" or "proc" which we shouldn't be checking.
331
   */
332
0
  if (source[0] == '/') {
333
    /* The source is an absolute path -- it better exist! */
334
0
    rc = stat(source, &st_buf);
335
0
    if (rc) {
336
0
      rc = errno;
337
0
      pwarn("stat(%s) failed", source);
338
0
      return -rc;
339
0
    }
340
341
    /*
342
     * If bind mounting, we only create a directory if the source
343
     * is a directory, else we always bind mount it as a file to
344
     * support device nodes, sockets, etc...
345
     *
346
     * For all other mounts, we assume a block/char source is
347
     * going to want a directory to mount to.  If the source is
348
     * something else (e.g. a fifo or socket), this probably will
349
     * not do the right thing, but we'll fail later on when we try
350
     * to mount(), so shouldn't be a big deal.
351
     */
352
0
    domkdir = S_ISDIR(st_buf.st_mode) ||
353
0
        (!bind && (S_ISBLK(st_buf.st_mode) ||
354
0
             S_ISCHR(st_buf.st_mode)));
355
0
  } else {
356
    /* The source is a relative path -- assume it's a pseudo fs. */
357
358
    /* Disallow relative bind mounts. */
359
0
    if (bind) {
360
0
      warn("relative bind-mounts are not allowed: source=%s",
361
0
           source);
362
0
      return -EINVAL;
363
0
    }
364
365
0
    domkdir = true;
366
0
  }
367
368
  /*
369
   * Now that we know what we want to do, do it!
370
   * We always create the intermediate dirs and the final path with 0755
371
   * perms and root/root ownership.  This shouldn't be a problem because
372
   * the actual mount will set those perms/ownership on the mount point
373
   * which is all people should need to access it.
374
   */
375
0
  rc = mkdir_p(dest, 0755, domkdir);
376
0
  if (rc)
377
0
    return rc;
378
0
  if (!domkdir) {
379
0
    attribute_cleanup_fd int fd =
380
0
        open(dest, O_RDWR | O_CREAT | O_CLOEXEC, 0700);
381
0
    if (fd < 0) {
382
0
      rc = errno;
383
0
      pwarn("open(%s) failed", dest);
384
0
      return -rc;
385
0
    }
386
0
  }
387
0
  if (chown(dest, uid, gid)) {
388
0
    rc = errno;
389
0
    pwarn("chown(%s, %u, %u) failed", dest, uid, gid);
390
0
    return -rc;
391
0
  }
392
0
  return 0;
393
0
}
394
395
/*
396
 * lookup_user: Gets the uid/gid for the given username.
397
 */
398
int lookup_user(const char *user, uid_t *uid, gid_t *gid)
399
0
{
400
0
  char *buf = NULL;
401
0
  struct passwd pw;
402
0
  struct passwd *ppw = NULL;
403
  /*
404
   * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
405
   * a suggested starting size for the buffer, so let's try getting this
406
   * size first, and fallback to a default othersise.
407
   */
408
0
  ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
409
0
  if (sz == -1)
410
0
    sz = 65536; /* your guess is as good as mine... */
411
412
0
  do {
413
0
    buf = malloc(sz);
414
0
    if (!buf)
415
0
      return -ENOMEM;
416
0
    int err = getpwnam_r(user, &pw, buf, sz, &ppw);
417
    /*
418
     * We're safe to free the buffer here. The strings inside |pw|
419
     * point inside |buf|, but we don't use any of them; this leaves
420
     * the pointers dangling but it's safe.
421
     * |ppw| points at |pw| if getpwnam_r(3) succeeded.
422
     */
423
0
    free(buf);
424
0
    if (err == ERANGE) {
425
      /* |buf| was too small, retry with a bigger one. */
426
0
      sz <<= 1;
427
0
    } else if (err != 0) {
428
      /* We got an error not related to the size of |buf|. */
429
0
      return -err;
430
0
    } else if (!ppw) {
431
      /* Not found. */
432
0
      return -ENOENT;
433
0
    } else {
434
0
      *uid = ppw->pw_uid;
435
0
      *gid = ppw->pw_gid;
436
0
      return 0;
437
0
    }
438
0
  } while (sz <= MAX_PWENT_SZ);
439
440
  /* A buffer of size MAX_PWENT_SZ is still too small, return an error. */
441
0
  return -ERANGE;
442
0
}
443
444
/*
445
 * lookup_group: Gets the gid for the given group name.
446
 */
447
int lookup_group(const char *group, gid_t *gid)
448
0
{
449
0
  char *buf = NULL;
450
0
  struct group gr;
451
0
  struct group *pgr = NULL;
452
  /*
453
   * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
454
   * a suggested starting size for the buffer, so let's try getting this
455
   * size first, and fallback to a default otherwise.
456
   */
457
0
  ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
458
0
  if (sz == -1)
459
0
    sz = 65536; /* and mine is as good as yours, really */
460
461
0
  do {
462
0
    buf = malloc(sz);
463
0
    if (!buf)
464
0
      return -ENOMEM;
465
0
    int err = getgrnam_r(group, &gr, buf, sz, &pgr);
466
    /*
467
     * We're safe to free the buffer here. The strings inside |gr|
468
     * point inside |buf|, but we don't use any of them; this leaves
469
     * the pointers dangling but it's safe.
470
     * |pgr| points at |gr| if getgrnam_r(3) succeeded.
471
     */
472
0
    free(buf);
473
0
    if (err == ERANGE) {
474
      /* |buf| was too small, retry with a bigger one. */
475
0
      sz <<= 1;
476
0
    } else if (err != 0) {
477
      /* We got an error not related to the size of |buf|. */
478
0
      return -err;
479
0
    } else if (!pgr) {
480
      /* Not found. */
481
0
      return -ENOENT;
482
0
    } else {
483
0
      *gid = pgr->gr_gid;
484
0
      return 0;
485
0
    }
486
0
  } while (sz <= MAX_GRENT_SZ);
487
488
  /* A buffer of size MAX_GRENT_SZ is still too small, return an error. */
489
0
  return -ERANGE;
490
0
}
491
492
static bool seccomp_action_is_available(const char *wanted)
493
0
{
494
0
  if (is_android()) {
495
    /*
496
     * Accessing |actions_avail| is generating SELinux denials, so
497
     * skip for now.
498
     * TODO(crbug.com/978022, jorgelo): Remove once the denial is
499
     * fixed.
500
     */
501
0
    return false;
502
0
  }
503
0
  const char actions_avail_path[] =
504
0
      "/proc/sys/kernel/seccomp/actions_avail";
505
0
  FILE *f = fopen(actions_avail_path, "re");
506
507
0
  if (!f) {
508
0
    pwarn("fopen(%s) failed", actions_avail_path);
509
0
    return false;
510
0
  }
511
512
0
  attribute_cleanup_str char *actions_avail = NULL;
513
0
  size_t buf_size = 0;
514
0
  if (getline(&actions_avail, &buf_size, f) < 0) {
515
0
    pwarn("getline() failed");
516
0
    return false;
517
0
  }
518
519
  /*
520
   * This is just substring search, which means that partial matches will
521
   * match too (e.g. "action" would match "longaction"). There are no
522
   * seccomp actions which include other actions though, so we're good for
523
   * now. Eventually we might want to split the string by spaces.
524
   */
525
0
  return strstr(actions_avail, wanted) != NULL;
526
0
}
527
528
/*
 * seccomp_ret_log_available: Reports whether the "log" seccomp action is
 * available. The answer is computed on the first call and cached in a
 * function-local static for later calls.
 */
int seccomp_ret_log_available(void)
{
	/* -1 means "not probed yet". */
	static int cached = -1;

	if (cached != -1)
		return cached;

	cached = seccomp_action_is_available("log");
	return cached;
}
537
538
/*
 * seccomp_ret_kill_process_available: Reports whether the "kill_process"
 * seccomp action is available. The answer is computed on the first call and
 * cached in a function-local static for later calls.
 */
int seccomp_ret_kill_process_available(void)
{
	/* -1 means "not probed yet". */
	static int cached = -1;

	if (cached != -1)
		return cached;

	cached = seccomp_action_is_available("kill_process");
	return cached;
}
548
549
/*
 * sys_set_no_new_privs: Sets no_new_privs on the calling process.
 * Returns true on success; logs a warning and returns false otherwise.
 */
bool sys_set_no_new_privs(void)
{
	/*
	 * See </kernel/seccomp.c> and </kernel/sys.c> in the kernel source
	 * tree for an explanation of the parameters.
	 */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
		pwarn("prctl(PR_SET_NO_NEW_PRIVS) failed");
		return false;
	}

	return true;
}
562
563
bool seccomp_filter_flags_available(unsigned int flags)
564
0
{
565
0
  return sys_seccomp(SECCOMP_SET_MODE_FILTER, flags, NULL) != -1 ||
566
0
         errno != EINVAL;
567
0
}
568
569
bool is_canonical_path(const char *path)
570
0
{
571
0
  attribute_cleanup_str char *rp = realpath(path, NULL);
572
0
  if (!rp) {
573
0
    pwarn("realpath(%s) failed", path);
574
0
    return false;
575
0
  }
576
577
0
  if (streq(path, rp)) {
578
0
    return true;
579
0
  }
580
581
0
  size_t path_len = strlen(path);
582
0
  size_t rp_len = strlen(rp);
583
  /* If |path| has a single trailing slash, that's OK. */
584
0
  return path_len == rp_len + 1 && strncmp(path, rp, rp_len) == 0 &&
585
0
         path[path_len - 1] == '/';
586
0
}