Coverage Report

Created: 2026-03-31 06:24

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/crosvm/third_party/minijail/libminijail.c
Line
Count
Source
1
/* Copyright 2012 The ChromiumOS Authors
2
 * Use of this source code is governed by a BSD-style license that can be
3
 * found in the LICENSE file.
4
 */
5
6
#define _BSD_SOURCE
7
#define _DEFAULT_SOURCE
8
#define _GNU_SOURCE
9
10
#include <asm/unistd.h>
11
#include <assert.h>
12
#include <dirent.h>
13
#include <errno.h>
14
#include <fcntl.h>
15
#include <grp.h>
16
#include <linux/capability.h>
17
#include <linux/filter.h>
18
#include <sched.h>
19
#include <signal.h>
20
#include <stddef.h>
21
#include <stdio.h>
22
#include <stdlib.h>
23
#include <string.h>
24
#include <sys/capability.h>
25
#include <sys/mount.h>
26
#include <sys/param.h>
27
#include <sys/prctl.h>
28
#include <sys/resource.h>
29
#include <sys/stat.h>
30
#include <sys/sysmacros.h>
31
#include <sys/types.h>
32
#include <sys/user.h>
33
#include <sys/wait.h>
34
#include <syscall.h>
35
#include <unistd.h>
36
37
#include "landlock_util.h"
38
#include "libminijail-private.h"
39
#include "libminijail.h"
40
41
#include "signal_handler.h"
42
#include "syscall_filter.h"
43
#include "syscall_wrapper.h"
44
#include "system.h"
45
#include "util.h"
46
47
/* Until these are reliably available in linux/prctl.h. */
48
#ifndef PR_ALT_SYSCALL
49
0
#define PR_ALT_SYSCALL 0x43724f53
50
#endif
51
52
/* New cgroup namespace might not be in linux-headers yet. */
53
#ifndef CLONE_NEWCGROUP
54
#define CLONE_NEWCGROUP 0x02000000
55
#endif
56
57
0
#define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */
58
59
0
#define MAX_RLIMITS 32 /* Currently there are 15 supported by Linux. */
60
61
0
#define MAX_PRESERVED_FDS 128U
62
63
/* Keyctl commands. */
64
0
#define KEYCTL_JOIN_SESSION_KEYRING 1
65
66
/*
67
 * The userspace equivalent of MNT_USER_SETTABLE_MASK, which is the mask of all
68
 * flags that can be modified by MS_REMOUNT.
69
 */
70
#define MS_USER_SETTABLE_MASK                                                  \
71
0
  (MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_NOATIME | MS_NODIRATIME |       \
72
0
   MS_RELATIME | MS_RDONLY)
73
74
/*
75
 * Required for Android host glibc which is permanently stuck on 2.17. Causes
76
 * no harm for newer glibc versions.
77
 */
78
#ifndef MS_NOSYMFOLLOW
79
/* Added locally in kernels 4.x+. */
80
0
#define MS_NOSYMFOLLOW 256
81
#endif
82
83
struct minijail_rlimit {
84
  int type;
85
  rlim_t cur;
86
  rlim_t max;
87
};
88
89
struct mountpoint {
90
  char *src;
91
  char *dest;
92
  char *type;
93
  char *data;
94
  int has_data;
95
  unsigned long flags;
96
  struct mountpoint *next;
97
};
98
99
struct minijail_remount {
100
  unsigned long remount_mode;
101
  char *mount_name;
102
  struct minijail_remount *next;
103
};
104
105
struct hook {
106
  minijail_hook_t hook;
107
  void *payload;
108
  minijail_hook_event_t event;
109
  struct hook *next;
110
};
111
112
struct fs_rule {
113
  char *path;
114
  uint64_t landlock_flags;
115
  struct fs_rule *next;
116
};
117
118
struct preserved_fd {
119
  int parent_fd;
120
  int child_fd;
121
};
122
123
/*
124
 * minijail struct: new fields should either be marshaled/unmarshaled or have a
125
 * comment explaining why that's unnecessary.
126
 */
127
struct minijail {
128
  /*
129
   * WARNING: new bool flags should always be added to this struct,
130
   * unless you’re certain they don’t need to remain after marshaling.
131
   * If you add a flag here you need to make sure it's
132
   * accounted for in minijail_pre{enter|exec}() below.
133
   */
134
  struct {
135
    bool uid : 1;
136
    bool gid : 1;
137
    bool inherit_suppl_gids : 1;
138
    bool set_suppl_gids : 1;
139
    bool keep_suppl_gids : 1;
140
    bool use_caps : 1;
141
    bool capbset_drop : 1;
142
    bool set_ambient_caps : 1;
143
    bool vfs : 1;
144
    bool enter_vfs : 1;
145
    bool pids : 1;
146
    bool ipc : 1;
147
    bool uts : 1;
148
    bool net : 1;
149
    bool net_loopback : 1;
150
    bool enter_net : 1;
151
    bool ns_cgroups : 1;
152
    bool userns : 1;
153
    bool disable_setgroups : 1;
154
    bool seccomp : 1;
155
    bool remount_proc_ro : 1;
156
    bool no_new_privs : 1;
157
    bool seccomp_filter : 1;
158
    bool seccomp_filter_tsync : 1;
159
    bool seccomp_filter_logging : 1;
160
    bool seccomp_filter_allow_speculation : 1;
161
    bool chroot : 1;
162
    bool pivot_root : 1;
163
    bool mount_dev : 1;
164
    bool mount_tmp : 1;
165
    bool do_init : 1;
166
    bool run_as_init : 1;
167
    bool pid_file : 1;
168
    bool cgroups : 1;
169
    bool alt_syscall : 1;
170
    bool reset_signal_mask : 1;
171
    bool reset_signal_handlers : 1;
172
    bool close_open_fds : 1;
173
    bool new_session_keyring : 1;
174
    bool forward_signals : 1;
175
    bool setsid : 1;
176
    bool using_minimalistic_mountns : 1;
177
    bool enable_fs_restrictions : 1;
178
    bool enable_profile_fs_restrictions : 1;
179
    bool enable_default_runtime : 1;
180
    bool enable_new_sessions : 1;
181
  } flags;
182
  uid_t uid;
183
  gid_t gid;
184
  gid_t usergid;
185
  char *user;
186
  size_t suppl_gid_count;
187
  gid_t *suppl_gid_list;
188
  uint64_t caps;
189
  uint64_t cap_bset;
190
  pid_t initpid;
191
  int mountns_fd;
192
  int netns_fd;
193
  int fs_rules_fd;
194
  int fs_rules_landlock_abi;
195
  char *chrootdir;
196
  char *pid_file_path;
197
  char *uidmap;
198
  char *gidmap;
199
  char *hostname;
200
  char *preload_path;
201
  /*
202
   * Filename that will be executed, unless an ELF fd is used instead.
203
   * This field is only used for logs and isn't included in marshaling.
204
   */
205
  char *filename;
206
  size_t filter_len;
207
  struct sock_fprog *filter_prog;
208
  char *alt_syscall_table;
209
  struct mountpoint *mounts_head;
210
  struct mountpoint *mounts_tail;
211
  size_t mounts_count;
212
  unsigned long remount_mode;
213
  struct minijail_remount *remounts_head;
214
  struct minijail_remount *remounts_tail;
215
  size_t tmpfs_size;
216
  struct fs_rule *fs_rules_head;
217
  struct fs_rule *fs_rules_tail;
218
  size_t fs_rules_count;
219
  char *cgroups[MAX_CGROUPS];
220
  size_t cgroup_count;
221
  struct minijail_rlimit rlimits[MAX_RLIMITS];
222
  size_t rlimit_count;
223
  uint64_t securebits_skip_mask;
224
  struct hook *hooks_head;
225
  struct hook *hooks_tail;
226
  struct preserved_fd preserved_fds[MAX_PRESERVED_FDS];
227
  size_t preserved_fd_count;
228
  char *seccomp_policy_path;
229
};
230
231
static void run_hooks_or_die(const struct minijail *j,
232
           minijail_hook_event_t event);
233
234
static bool seccomp_is_logging_allowed(const struct minijail *j)
235
0
{
236
0
  return seccomp_default_ret_log() || j->flags.seccomp_filter_logging;
237
0
}
238
239
static void free_mounts_list(struct minijail *j)
240
0
{
241
0
  while (j->mounts_head) {
242
0
    struct mountpoint *m = j->mounts_head;
243
0
    j->mounts_head = j->mounts_head->next;
244
0
    free(m->data);
245
0
    free(m->type);
246
0
    free(m->dest);
247
0
    free(m->src);
248
0
    free(m);
249
0
  }
250
  // No need to clear mounts_head as we know it's NULL after the loop.
251
0
  j->mounts_tail = NULL;
252
0
}
253
254
static void free_remounts_list(struct minijail *j)
255
0
{
256
0
  while (j->remounts_head) {
257
0
    struct minijail_remount *m = j->remounts_head;
258
0
    j->remounts_head = j->remounts_head->next;
259
0
    free(m->mount_name);
260
0
    free(m);
261
0
  }
262
  // No need to clear remounts_head as we know it's NULL after the loop.
263
0
  j->remounts_tail = NULL;
264
0
}
265
266
static void free_fs_rules_list(struct minijail *j)
267
0
{
268
0
  while (j->fs_rules_head) {
269
0
    struct fs_rule *r = j->fs_rules_head;
270
0
    j->fs_rules_head = j->fs_rules_head->next;
271
0
    free(r->path);
272
0
    free(r);
273
0
  }
274
0
  j->fs_rules_tail = NULL;
275
0
}
276
277
/*
278
 * Writes exactly n bytes from buf to file descriptor fd.
279
 * Returns 0 on success or a negative error code on error.
280
 */
281
static int write_exactly(int fd, const void *buf, size_t n)
282
0
{
283
0
  const char *p = buf;
284
0
  while (n > 0) {
285
0
    const ssize_t written = write(fd, p, n);
286
0
    if (written < 0) {
287
0
      if (errno == EINTR)
288
0
        continue;
289
290
0
      return -errno;
291
0
    }
292
293
0
    p += written;
294
0
    n -= written;
295
0
  }
296
297
0
  return 0;
298
0
}
299
300
/*
301
 * Reads exactly n bytes from file descriptor fd into buf.
302
 * Returns 0 on success or a negative error code on error.
303
 */
304
static int read_exactly(int fd, void *buf, size_t n)
305
0
{
306
0
  char *p = buf;
307
0
  while (n > 0) {
308
0
    const ssize_t bytes = read(fd, p, n);
309
0
    if (bytes < 0) {
310
0
      if (errno == EINTR)
311
0
        continue;
312
313
0
      return -errno;
314
0
    }
315
0
    if (bytes == 0) {
316
0
      errno = EPIPE;
317
0
      return -EPIPE;
318
0
    }
319
320
0
    p += bytes;
321
0
    n -= bytes;
322
0
  }
323
324
0
  return 0;
325
0
}
326
327
/* Closes *pfd and sets it to -1. */
328
static void close_and_reset(int *pfd)
329
0
{
330
0
  if (*pfd != -1)
331
0
    close(*pfd);
332
0
  *pfd = -1;
333
0
}
334
335
/*
336
 * Strip out flags meant for the parent.
337
 * We keep things that are not inherited across execve(2) (e.g. capabilities),
338
 * or are easier to set after execve(2) (e.g. seccomp filters).
339
 */
340
void minijail_preenter(struct minijail *j)
341
0
{
342
0
  j->flags.vfs = 0;
343
0
  j->flags.enter_vfs = 0;
344
0
  j->flags.ns_cgroups = 0;
345
0
  j->flags.net = 0;
346
0
  j->flags.net_loopback = 0;
347
0
  j->flags.uts = 0;
348
0
  j->flags.remount_proc_ro = 0;
349
0
  j->flags.pids = 0;
350
0
  j->flags.do_init = 0;
351
0
  j->flags.run_as_init = 0;
352
0
  j->flags.pid_file = 0;
353
0
  j->flags.cgroups = 0;
354
0
  j->flags.forward_signals = 0;
355
0
  j->flags.setsid = 0;
356
0
  j->remount_mode = 0;
357
0
  j->flags.using_minimalistic_mountns = 0;
358
0
  j->flags.enable_profile_fs_restrictions = 0;
359
0
  j->flags.enable_default_runtime = 0;
360
0
  j->flags.enable_new_sessions = 0;
361
0
  free_remounts_list(j);
362
0
}
363
364
static bool fs_refer_restriction_supported(struct minijail *j)
365
0
{
366
0
  if (j->fs_rules_landlock_abi < 0) {
367
0
    const int abi = landlock_create_ruleset(
368
0
        NULL, 0, LANDLOCK_CREATE_RULESET_VERSION);
369
    /*
370
     * If we have a valid ABI, save the result. Otherwise, leave
371
     * the struct field unmodified to make sure it's correctly
372
     * marshaled and unmarshaled.
373
     */
374
0
    if (abi > 0) {
375
0
      j->fs_rules_landlock_abi = abi;
376
0
    }
377
0
  }
378
379
0
  return j->fs_rules_landlock_abi >= LANDLOCK_ABI_FS_REFER_SUPPORTED;
380
0
}
381
382
/* Sets fs_rules_fd to an empty ruleset, if Landlock is available. */
383
static int setup_fs_rules_fd(struct minijail *j)
384
0
{
385
0
  struct minijail_landlock_ruleset_attr ruleset_attr = {
386
0
      .handled_access_fs = HANDLED_ACCESS_TYPES};
387
0
  if (fs_refer_restriction_supported(j)) {
388
0
    ruleset_attr.handled_access_fs |= LANDLOCK_ACCESS_FS_REFER;
389
0
  }
390
391
0
  j->fs_rules_fd =
392
0
      landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0);
393
0
  if (j->fs_rules_fd < 0) {
394
    /*
395
     * As of Landlock ABI=3, the useful errors we expect here are
396
     * ENOSYS or EOPNOTSUPP. In both cases, Landlock is not
397
     * supported by the kernel and Minijail can silently ignore it.
398
     * TODO(b/300142205): log when we no longer have 5.4 kernels in
399
     * ChromeOS (~EoY 2024).
400
     */
401
0
    return errno;
402
0
  }
403
404
0
  return 0;
405
0
}
406
407
/* Adds a rule for a given path to apply once minijail is entered. */
408
static int add_fs_restriction_path(struct minijail *j, const char *path,
409
           uint64_t landlock_flags)
410
0
{
411
0
  struct fs_rule *r = calloc(1, sizeof(*r));
412
0
  if (!r)
413
0
    return -ENOMEM;
414
0
  r->path = strdup(path);
415
0
  r->landlock_flags = landlock_flags;
416
417
0
  if (j->fs_rules_tail) {
418
0
    j->fs_rules_tail->next = r;
419
0
    j->fs_rules_tail = r;
420
0
  } else {
421
0
    j->fs_rules_head = r;
422
0
    j->fs_rules_tail = r;
423
0
  }
424
425
  /*
426
   * If this is our first rule, set up the rules FD early for API users.
427
   *
428
   * This is important for users calling minijail_enter() directly.
429
 * Otherwise, this is handled later inside minijail_run_internal().
430
   *
431
   * The reason for this is because setup_fs_rules_fd() needs to be
432
   * called from inside the process that applies Landlock rules. For
433
   * minijail_enter(), that's this process. For minijail_run_internal(),
434
   * that's the child process.
435
   */
436
0
  if (j->fs_rules_count == 0)
437
0
    setup_fs_rules_fd(j);
438
439
0
  j->fs_rules_count++;
440
0
  return 0;
441
0
}
442
443
bool mount_has_bind_flag(struct mountpoint *m)
444
0
{
445
0
  return !!(m->flags & MS_BIND);
446
0
}
447
448
bool mount_has_readonly_flag(struct mountpoint *m)
449
0
{
450
0
  return !!(m->flags & MS_RDONLY);
451
0
}
452
453
bool mount_events_allowed(struct mountpoint *m)
454
0
{
455
0
  return !!(m->flags & MS_SHARED) || !!(m->flags & MS_SLAVE);
456
0
}
457
458
/*
459
 * Strip out flags meant for the child.
460
 * We keep things that are inherited across execve(2).
461
 */
462
void minijail_preexec(struct minijail *j)
463
0
{
464
0
  int vfs = j->flags.vfs;
465
0
  int enter_vfs = j->flags.enter_vfs;
466
0
  int ns_cgroups = j->flags.ns_cgroups;
467
0
  int net = j->flags.net;
468
0
  int net_loopback = j->flags.net_loopback;
469
0
  int uts = j->flags.uts;
470
0
  int remount_proc_ro = j->flags.remount_proc_ro;
471
0
  int userns = j->flags.userns;
472
0
  int using_minimalistic_mountns = j->flags.using_minimalistic_mountns;
473
0
  int enable_fs_restrictions = j->flags.enable_fs_restrictions;
474
0
  int enable_profile_fs_restrictions =
475
0
      j->flags.enable_profile_fs_restrictions;
476
0
  int enable_default_runtime = j->flags.enable_default_runtime;
477
0
  int enable_new_sessions = j->flags.enable_new_sessions;
478
0
  if (j->user)
479
0
    free(j->user);
480
0
  j->user = NULL;
481
0
  if (j->suppl_gid_list)
482
0
    free(j->suppl_gid_list);
483
0
  j->suppl_gid_list = NULL;
484
0
  if (j->preload_path)
485
0
    free(j->preload_path);
486
0
  j->preload_path = NULL;
487
0
  free_mounts_list(j);
488
0
  free_fs_rules_list(j);
489
0
  memset(&j->flags, 0, sizeof(j->flags));
490
  /* Now restore anything we meant to keep. */
491
0
  j->flags.vfs = vfs;
492
0
  j->flags.enter_vfs = enter_vfs;
493
0
  j->flags.ns_cgroups = ns_cgroups;
494
0
  j->flags.net = net;
495
0
  j->flags.net_loopback = net_loopback;
496
0
  j->flags.uts = uts;
497
0
  j->flags.remount_proc_ro = remount_proc_ro;
498
0
  j->flags.userns = userns;
499
0
  j->flags.using_minimalistic_mountns = using_minimalistic_mountns;
500
0
  j->flags.enable_fs_restrictions = enable_fs_restrictions;
501
0
  j->flags.enable_profile_fs_restrictions =
502
0
      enable_profile_fs_restrictions;
503
0
  j->flags.enable_default_runtime = enable_default_runtime;
504
0
  j->flags.enable_new_sessions = enable_new_sessions;
505
  /* Note, |pids| will already have been used before this call. */
506
0
}
507
508
/* Minijail API. */
509
510
struct minijail API *minijail_new(void)
511
0
{
512
0
  struct minijail *j = calloc(1, sizeof(struct minijail));
513
0
  if (j) {
514
0
    j->remount_mode = MS_PRIVATE;
515
0
    j->fs_rules_fd = -1;
516
0
    j->fs_rules_landlock_abi = -1;
517
0
    j->flags.using_minimalistic_mountns = false;
518
0
    j->flags.enable_fs_restrictions = true;
519
0
    j->flags.enable_profile_fs_restrictions = true;
520
0
    j->flags.enable_default_runtime = true;
521
0
    j->flags.enable_new_sessions = true;
522
0
  }
523
0
  return j;
524
0
}
525
526
void API minijail_change_uid(struct minijail *j, uid_t uid)
527
0
{
528
0
  if (uid == 0)
529
0
    die("useless change to uid 0");
530
0
  j->uid = uid;
531
0
  j->flags.uid = 1;
532
0
}
533
534
void API minijail_change_gid(struct minijail *j, gid_t gid)
535
0
{
536
0
  if (gid == 0)
537
0
    die("useless change to gid 0");
538
0
  j->gid = gid;
539
0
  j->flags.gid = 1;
540
0
}
541
542
void API minijail_set_supplementary_gids(struct minijail *j, size_t size,
543
           const gid_t *list)
544
0
{
545
0
  size_t i;
546
547
0
  if (j->flags.inherit_suppl_gids)
548
0
    die("cannot inherit *and* set supplementary groups");
549
0
  if (j->flags.keep_suppl_gids)
550
0
    die("cannot keep *and* set supplementary groups");
551
552
0
  if (size == 0) {
553
    /* Clear supplementary groups. */
554
0
    j->suppl_gid_list = NULL;
555
0
    j->suppl_gid_count = 0;
556
0
    j->flags.set_suppl_gids = 1;
557
0
    return;
558
0
  }
559
560
  /* Copy the gid_t array. */
561
0
  j->suppl_gid_list = calloc(size, sizeof(gid_t));
562
0
  if (!j->suppl_gid_list) {
563
0
    die("failed to allocate internal supplementary group array");
564
0
  }
565
0
  for (i = 0; i < size; i++) {
566
0
    j->suppl_gid_list[i] = list[i];
567
0
  }
568
0
  j->suppl_gid_count = size;
569
0
  j->flags.set_suppl_gids = 1;
570
0
}
571
572
void API minijail_keep_supplementary_gids(struct minijail *j)
573
0
{
574
0
  j->flags.keep_suppl_gids = 1;
575
0
}
576
577
int API minijail_change_user(struct minijail *j, const char *user)
578
0
{
579
0
  uid_t uid;
580
0
  gid_t gid;
581
0
  int rc = lookup_user(user, &uid, &gid);
582
0
  if (rc)
583
0
    return rc;
584
0
  minijail_change_uid(j, uid);
585
0
  j->user = strdup(user);
586
0
  if (!j->user)
587
0
    return -ENOMEM;
588
0
  j->usergid = gid;
589
0
  return 0;
590
0
}
591
592
int API minijail_change_group(struct minijail *j, const char *group)
593
0
{
594
0
  gid_t gid;
595
0
  int rc = lookup_group(group, &gid);
596
0
  if (rc)
597
0
    return rc;
598
0
  minijail_change_gid(j, gid);
599
0
  return 0;
600
0
}
601
602
void API minijail_use_seccomp(struct minijail *j)
603
0
{
604
0
  j->flags.seccomp = 1;
605
0
}
606
607
void API minijail_no_new_privs(struct minijail *j)
608
0
{
609
0
  j->flags.no_new_privs = 1;
610
0
}
611
612
void API minijail_use_seccomp_filter(struct minijail *j)
613
0
{
614
0
  j->flags.seccomp_filter = 1;
615
0
}
616
617
void API minijail_set_seccomp_filter_tsync(struct minijail *j)
618
0
{
619
0
  if (j->filter_len > 0 && j->filter_prog != NULL) {
620
0
    die("minijail_set_seccomp_filter_tsync() must be called "
621
0
        "before minijail_parse_seccomp_filters()");
622
0
  }
623
624
0
  if (seccomp_is_logging_allowed(j) && !seccomp_ret_log_available()) {
625
    /*
626
     * If SECCOMP_RET_LOG is not available, we don't want to use
627
     * SECCOMP_RET_TRAP to both kill the entire process and report
628
     * failing syscalls, since it will be brittle. Just bail.
629
     */
630
0
    die("SECCOMP_RET_LOG not available, cannot use logging with "
631
0
        "thread sync at the same time");
632
0
  }
633
634
0
  j->flags.seccomp_filter_tsync = 1;
635
0
}
636
637
void API minijail_set_seccomp_filter_allow_speculation(struct minijail *j)
638
0
{
639
0
  if (j->filter_len > 0 && j->filter_prog != NULL) {
640
0
    die("minijail_set_seccomp_filter_allow_speculation() must be "
641
0
        "called before minijail_parse_seccomp_filters()");
642
0
  }
643
644
0
  j->flags.seccomp_filter_allow_speculation = 1;
645
0
}
646
647
void API minijail_log_seccomp_filter_failures(struct minijail *j)
648
0
{
649
0
  if (j->filter_len > 0 && j->filter_prog != NULL) {
650
0
    die("minijail_log_seccomp_filter_failures() must be called "
651
0
        "before minijail_parse_seccomp_filters()");
652
0
  }
653
654
0
  if (j->flags.seccomp_filter_tsync && !seccomp_ret_log_available()) {
655
    /*
656
     * If SECCOMP_RET_LOG is not available, we don't want to use
657
     * SECCOMP_RET_TRAP to both kill the entire process and report
658
     * failing syscalls, since it will be brittle. Just bail.
659
     */
660
0
    die("SECCOMP_RET_LOG not available, cannot use thread sync "
661
0
        "with logging at the same time");
662
0
  }
663
664
0
  if (debug_logging_allowed()) {
665
0
    j->flags.seccomp_filter_logging = 1;
666
0
  } else {
667
0
    warn("non-debug build: ignoring request to enable seccomp "
668
0
         "logging");
669
0
  }
670
0
}
671
672
void API minijail_set_using_minimalistic_mountns(struct minijail *j)
673
0
{
674
0
  j->flags.using_minimalistic_mountns = true;
675
0
}
676
677
void API minijail_set_enable_new_sessions(struct minijail *j,
678
            bool enable_new_sessions)
679
0
{
680
0
  j->flags.enable_new_sessions = enable_new_sessions;
681
0
}
682
683
void API minijail_set_enable_default_runtime(struct minijail *j,
684
               bool enable_default_runtime)
685
0
{
686
0
  j->flags.enable_default_runtime = enable_default_runtime;
687
0
}
688
689
bool API minijail_get_enable_default_runtime(struct minijail *j)
690
0
{
691
0
  return j->flags.enable_default_runtime;
692
0
}
693
694
bool API minijail_is_fs_restriction_available(void)
695
0
{
696
0
  const int abi =
697
0
      landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION);
698
  // ABI > 0 is considered supported.
699
0
  return abi > 0;
700
0
}
701
702
void API minijail_disable_fs_restrictions(struct minijail *j)
703
0
{
704
0
  j->flags.enable_fs_restrictions = false;
705
0
}
706
707
void API minijail_set_enable_profile_fs_restrictions(struct minijail *j)
708
0
{
709
0
  j->flags.enable_profile_fs_restrictions = true;
710
0
}
711
712
void API minijail_add_minimalistic_mountns_fs_rules(struct minijail *j)
713
0
{
714
0
  struct mountpoint *m = j->mounts_head;
715
0
  bool landlock_enabled_by_profile = false;
716
0
  if (!j->flags.using_minimalistic_mountns ||
717
0
      !j->flags.enable_profile_fs_restrictions)
718
0
    return;
719
720
  /* Apply Landlock rules. */
721
0
  while (m) {
722
0
    landlock_enabled_by_profile = true;
723
0
    minijail_add_fs_restriction_rx(j, m->dest);
724
    /*
725
     * Allow rw if mounted as writable, or mount flags allow mount
726
     * events.
727
     */
728
0
    if (!mount_has_readonly_flag(m) || mount_events_allowed(m))
729
0
      minijail_add_fs_restriction_advanced_rw(j, m->dest);
730
0
    m = m->next;
731
0
  }
732
0
  if (landlock_enabled_by_profile) {
733
0
    minijail_enable_default_fs_restrictions(j);
734
0
    minijail_add_fs_restriction_edit(j, "/dev");
735
0
    minijail_add_fs_restriction_ro(j, "/proc");
736
0
    if (j->flags.vfs)
737
0
      minijail_add_fs_restriction_rw(j, "/tmp");
738
0
  }
739
0
}
740
741
void API minijail_enable_default_fs_restrictions(struct minijail *j)
742
0
{
743
  // Common library locations.
744
0
  minijail_add_fs_restriction_rx(j, "/lib");
745
0
  minijail_add_fs_restriction_rx(j, "/lib64");
746
0
  minijail_add_fs_restriction_rx(j, "/usr/lib");
747
0
  minijail_add_fs_restriction_rx(j, "/usr/lib64");
748
  // Common locations for services invoking Minijail.
749
0
  minijail_add_fs_restriction_rx(j, "/bin");
750
0
  minijail_add_fs_restriction_rx(j, "/sbin");
751
0
  minijail_add_fs_restriction_rx(j, "/usr/sbin");
752
0
  minijail_add_fs_restriction_rx(j, "/usr/bin");
753
  // Common /etc locations.
754
0
  minijail_add_fs_restriction_ro(j, "/etc/group");
755
0
  minijail_add_fs_restriction_ro(j, "/etc/passwd");
756
0
}
757
758
void API minijail_use_caps(struct minijail *j, uint64_t capmask)
759
0
{
760
  /*
761
   * 'minijail_use_caps' configures a runtime-capabilities-only
762
   * environment, including a bounding set matching the thread's runtime
763
   * (permitted|inheritable|effective) sets.
764
   * Therefore, it will override any existing bounding set configurations
765
   * since the latter would allow gaining extra runtime capabilities from
766
   * file capabilities.
767
   */
768
0
  if (j->flags.capbset_drop) {
769
0
    warn("overriding bounding set configuration");
770
0
    j->cap_bset = 0;
771
0
    j->flags.capbset_drop = 0;
772
0
  }
773
0
  j->caps = capmask;
774
0
  j->flags.use_caps = 1;
775
0
}
776
777
void API minijail_capbset_drop(struct minijail *j, uint64_t capmask)
778
0
{
779
0
  if (j->flags.use_caps) {
780
    /*
781
     * 'minijail_use_caps' will have already configured a capability
782
     * bounding set matching the (permitted|inheritable|effective)
783
     * sets. Abort if the user tries to configure a separate
784
     * bounding set. 'minijail_capbset_drop' and 'minijail_use_caps'
785
     * are mutually exclusive.
786
     */
787
0
    die("runtime capabilities already configured, can't drop "
788
0
        "bounding set separately");
789
0
  }
790
0
  j->cap_bset = capmask;
791
0
  j->flags.capbset_drop = 1;
792
0
}
793
794
void API minijail_set_ambient_caps(struct minijail *j)
795
0
{
796
0
  j->flags.set_ambient_caps = 1;
797
0
}
798
799
void API minijail_reset_signal_mask(struct minijail *j)
800
0
{
801
0
  j->flags.reset_signal_mask = 1;
802
0
}
803
804
void API minijail_reset_signal_handlers(struct minijail *j)
805
0
{
806
0
  j->flags.reset_signal_handlers = 1;
807
0
}
808
809
void API minijail_namespace_vfs(struct minijail *j)
810
0
{
811
0
  j->flags.vfs = 1;
812
0
}
813
814
void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
815
0
{
816
  /* Note: Do not use O_CLOEXEC here.  We'll close it after we use it. */
817
0
  int ns_fd = open(ns_path, O_RDONLY);
818
0
  if (ns_fd < 0) {
819
0
    pdie("failed to open namespace '%s'", ns_path);
820
0
  }
821
0
  j->mountns_fd = ns_fd;
822
0
  j->flags.enter_vfs = 1;
823
0
}
824
825
void API minijail_new_session_keyring(struct minijail *j)
826
0
{
827
0
  j->flags.new_session_keyring = 1;
828
0
}
829
830
void API minijail_skip_setting_securebits(struct minijail *j,
831
            uint64_t securebits_skip_mask)
832
0
{
833
0
  j->securebits_skip_mask = securebits_skip_mask;
834
0
}
835
836
void API minijail_remount_mode(struct minijail *j, unsigned long mode)
837
0
{
838
0
  j->remount_mode = mode;
839
0
}
840
841
void API minijail_skip_remount_private(struct minijail *j)
842
0
{
843
0
  j->remount_mode = 0;
844
0
}
845
846
void API minijail_namespace_pids(struct minijail *j)
847
0
{
848
0
  j->flags.vfs = 1;
849
0
  j->flags.remount_proc_ro = 1;
850
0
  j->flags.pids = 1;
851
0
  j->flags.do_init = 1;
852
0
}
853
854
void API minijail_namespace_pids_rw_proc(struct minijail *j)
855
0
{
856
0
  j->flags.vfs = 1;
857
0
  j->flags.pids = 1;
858
0
  j->flags.do_init = 1;
859
0
}
860
861
void API minijail_namespace_ipc(struct minijail *j)
862
0
{
863
0
  j->flags.ipc = 1;
864
0
}
865
866
void API minijail_namespace_uts(struct minijail *j)
867
0
{
868
0
  j->flags.uts = 1;
869
0
}
870
871
int API minijail_namespace_set_hostname(struct minijail *j, const char *name)
872
0
{
873
0
  if (j->hostname)
874
0
    return -EINVAL;
875
0
  minijail_namespace_uts(j);
876
0
  j->hostname = strdup(name);
877
0
  if (!j->hostname)
878
0
    return -ENOMEM;
879
0
  return 0;
880
0
}
881
882
void API minijail_namespace_net_loopback(struct minijail *j,
883
           bool enable_loopback)
884
0
{
885
0
  j->flags.net = 1;
886
0
  j->flags.net_loopback = enable_loopback;
887
0
}
888
889
void API minijail_namespace_net(struct minijail *j)
890
0
{
891
0
  minijail_namespace_net_loopback(j, true);
892
0
}
893
894
void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
895
0
{
896
  /* Note: Do not use O_CLOEXEC here.  We'll close it after we use it. */
897
0
  int ns_fd = open(ns_path, O_RDONLY);
898
0
  if (ns_fd < 0) {
899
0
    pdie("failed to open namespace '%s'", ns_path);
900
0
  }
901
0
  j->netns_fd = ns_fd;
902
0
  j->flags.enter_net = 1;
903
0
}
904
905
void API minijail_namespace_cgroups(struct minijail *j)
906
0
{
907
0
  j->flags.ns_cgroups = 1;
908
0
}
909
910
void API minijail_close_open_fds(struct minijail *j)
911
0
{
912
0
  j->flags.close_open_fds = 1;
913
0
}
914
915
void API minijail_remount_proc_readonly(struct minijail *j)
916
0
{
917
0
  j->flags.vfs = 1;
918
0
  j->flags.remount_proc_ro = 1;
919
0
}
920
921
void API minijail_namespace_user(struct minijail *j)
922
0
{
923
0
  j->flags.userns = 1;
924
0
}
925
926
void API minijail_namespace_user_disable_setgroups(struct minijail *j)
927
0
{
928
0
  j->flags.disable_setgroups = 1;
929
0
}
930
931
int API minijail_uidmap(struct minijail *j, const char *uidmap)
932
0
{
933
0
  j->uidmap = strdup(uidmap);
934
0
  if (!j->uidmap)
935
0
    return -ENOMEM;
936
0
  char *ch;
937
0
  for (ch = j->uidmap; *ch; ch++) {
938
0
    if (*ch == ',')
939
0
      *ch = '\n';
940
0
  }
941
0
  return 0;
942
0
}
943
944
int API minijail_gidmap(struct minijail *j, const char *gidmap)
945
0
{
946
0
  j->gidmap = strdup(gidmap);
947
0
  if (!j->gidmap)
948
0
    return -ENOMEM;
949
0
  char *ch;
950
0
  for (ch = j->gidmap; *ch; ch++) {
951
0
    if (*ch == ',')
952
0
      *ch = '\n';
953
0
  }
954
0
  return 0;
955
0
}
956
957
void API minijail_inherit_usergroups(struct minijail *j)
958
0
{
959
0
  j->flags.inherit_suppl_gids = 1;
960
0
}
961
962
void API minijail_run_as_init(struct minijail *j)
963
0
{
964
  /*
965
   * Since the jailed program will become 'init' in the new PID namespace,
966
   * Minijail does not need to fork an 'init' process.
967
   */
968
0
  j->flags.run_as_init = 1;
969
0
}
970
971
int API minijail_enter_chroot(struct minijail *j, const char *dir)
972
0
{
973
0
  if (j->chrootdir)
974
0
    return -EINVAL;
975
0
  j->chrootdir = strdup(dir);
976
0
  if (!j->chrootdir)
977
0
    return -ENOMEM;
978
0
  j->flags.chroot = 1;
979
0
  return 0;
980
0
}
981
982
int API minijail_enter_pivot_root(struct minijail *j, const char *dir)
983
0
{
984
0
  if (j->chrootdir)
985
0
    return -EINVAL;
986
0
  j->chrootdir = strdup(dir);
987
0
  if (!j->chrootdir)
988
0
    return -ENOMEM;
989
0
  j->flags.pivot_root = 1;
990
0
  return 0;
991
0
}
992
993
char API *minijail_get_original_path(struct minijail *j,
994
             const char *path_inside_chroot)
995
0
{
996
0
  struct mountpoint *b;
997
998
0
  b = j->mounts_head;
999
0
  while (b) {
1000
    /*
1001
     * If |path_inside_chroot| is the exact destination of a
1002
     * mount, then the original path is exactly the source of
1003
     * the mount.
1004
     *  for example: "-b /some/path/exe,/chroot/path/exe"
1005
     *    mount source = /some/path/exe, mount dest =
1006
     *    /chroot/path/exe Then when getting the original path of
1007
     *    "/chroot/path/exe", the source of that mount,
1008
     *    "/some/path/exe" is what should be returned.
1009
     */
1010
0
    if (streq(b->dest, path_inside_chroot))
1011
0
      return strdup(b->src);
1012
1013
    /*
1014
     * If |path_inside_chroot| is within the destination path of a
1015
     * mount, take the suffix of the chroot path relative to the
1016
     * mount destination path, and append it to the mount source
1017
     * path.
1018
     */
1019
0
    if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) {
1020
0
      const char *relative_path =
1021
0
          path_inside_chroot + strlen(b->dest);
1022
0
      return path_join(b->src, relative_path);
1023
0
    }
1024
0
    b = b->next;
1025
0
  }
1026
1027
  /* If there is a chroot path, append |path_inside_chroot| to that. */
1028
0
  if (j->chrootdir)
1029
0
    return path_join(j->chrootdir, path_inside_chroot);
1030
1031
  /* No chroot, so the path outside is the same as it is inside. */
1032
0
  return strdup(path_inside_chroot);
1033
0
}
1034
1035
void API minijail_mount_dev(struct minijail *j)
1036
0
{
1037
0
  j->flags.mount_dev = 1;
1038
0
}
1039
1040
void API minijail_mount_tmp(struct minijail *j)
1041
0
{
1042
0
  minijail_mount_tmp_size(j, 64 * 1024 * 1024);
1043
0
}
1044
1045
void API minijail_mount_tmp_size(struct minijail *j, size_t size)
1046
0
{
1047
0
  j->tmpfs_size = size;
1048
0
  j->flags.mount_tmp = 1;
1049
0
}
1050
1051
/*
 * Arrange for the jailed child's pid to be written to |path|.
 * Returns 0, or -ENOMEM if the path cannot be copied.
 */
int API minijail_write_pid_file(struct minijail *j, const char *path)
{
	/* Keep our own copy; the caller may free |path| after this returns. */
	char *path_copy = strdup(path);

	if (path_copy == NULL)
		return -ENOMEM;
	j->pid_file_path = path_copy;
	j->flags.pid_file = 1;
	return 0;
}
1059
1060
/*
 * Add a cgroup the child should be placed into.
 * Returns 0, or -ENOMEM when all slots are taken or the copy fails.
 */
int API minijail_add_to_cgroup(struct minijail *j, const char *path)
{
	/* Linux supports at most MAX_CGROUPS distinct controllers. */
	if (j->cgroup_count >= MAX_CGROUPS)
		return -ENOMEM;

	char *path_copy = strdup(path);
	if (path_copy == NULL)
		return -ENOMEM;

	j->cgroups[j->cgroup_count] = path_copy;
	j->cgroup_count++;
	j->flags.cgroups = 1;
	return 0;
}
1071
1072
/*
 * Register an rlimit to apply in the child.
 * Returns 0; -ENOMEM when the table is full; -EEXIST if |type| was
 * already configured.
 */
int API minijail_rlimit(struct minijail *j, int type, rlim_t cur, rlim_t max)
{
	if (j->rlimit_count >= MAX_RLIMITS)
		return -ENOMEM;

	/* It's an error if the caller sets the same rlimit multiple times. */
	for (size_t idx = 0; idx < j->rlimit_count; idx++) {
		if (j->rlimits[idx].type == type)
			return -EEXIST;
	}

	size_t slot = j->rlimit_count;
	j->rlimits[slot].type = type;
	j->rlimits[slot].cur = cur;
	j->rlimits[slot].max = max;
	j->rlimit_count = slot + 1;
	return 0;
}
1090
1091
/* Forward signals received by the launcher to the jailed child. */
int API minijail_forward_signals(struct minijail *j)
{
	j->flags.forward_signals = 1;
	return 0;
}
1096
1097
/* Make the jailed child call setsid() to start a new session. */
int API minijail_create_session(struct minijail *j)
{
	j->flags.setsid = 1;
	return 0;
}
1102
1103
/* Allow read + execute access to |path| via Landlock. */
int API minijail_add_fs_restriction_rx(struct minijail *j, const char *path)
{
	const uint16_t access = ACCESS_FS_ROUGHLY_READ_EXECUTE;

	return !add_fs_restriction_path(j, path, access);
}
1108
1109
/* Allow read-only access to |path| via Landlock. */
int API minijail_add_fs_restriction_ro(struct minijail *j, const char *path)
{
	const uint16_t access = ACCESS_FS_ROUGHLY_READ;

	return !add_fs_restriction_path(j, path, access);
}
1113
1114
/* Allow read plus basic write access to |path| via Landlock. */
int API minijail_add_fs_restriction_rw(struct minijail *j, const char *path)
{
	const uint16_t access =
	    ACCESS_FS_ROUGHLY_READ | ACCESS_FS_ROUGHLY_BASIC_WRITE;

	return !add_fs_restriction_path(j, path, access);
}
1119
1120
/*
 * Allow read plus full write access to |path| via Landlock, adding the
 * cross-directory link/rename right when the running kernel supports it.
 */
int API minijail_add_fs_restriction_advanced_rw(struct minijail *j,
						const char *path)
{
	uint16_t access = ACCESS_FS_ROUGHLY_READ | ACCESS_FS_ROUGHLY_FULL_WRITE;

	/* LANDLOCK_ACCESS_FS_REFER is only available on newer Landlock ABIs. */
	if (fs_refer_restriction_supported(j))
		access |= LANDLOCK_ACCESS_FS_REFER;

	return !add_fs_restriction_path(j, path, access);
}
1131
1132
/* Allow read plus "edit" (in-place modification) access to |path|. */
int API minijail_add_fs_restriction_edit(struct minijail *j, const char *path)
{
	const uint16_t access =
	    ACCESS_FS_ROUGHLY_READ | ACCESS_FS_ROUGHLY_EDIT;

	return !add_fs_restriction_path(j, path, access);
}
1137
1138
/* Allow exactly the caller-supplied Landlock access bits on |path|. */
int API minijail_add_fs_restriction_access_rights(struct minijail *j,
						  const char *path,
						  uint16_t landlock_flags)
{
	bool added = add_fs_restriction_path(j, path, landlock_flags);

	return !added;
}
1144
1145
bool API
1146
minijail_is_fs_restriction_ruleset_initialized(const struct minijail *j)
1147
0
{
1148
0
  return j->fs_rules_fd >= 0;
1149
0
}
1150
1151
static bool is_valid_bind_path(const char *path)
1152
0
{
1153
0
  if (!block_symlinks_in_bindmount_paths()) {
1154
0
    return true;
1155
0
  }
1156
1157
  /*
1158
   * tokenize() will modify both the |prefixes| pointer and the contents
1159
   * of the string, so:
1160
   * -Copy |BINDMOUNT_ALLOWED_PREFIXES| since it lives in .rodata.
1161
   * -Save the original pointer for free()ing.
1162
   */
1163
0
  char *prefixes = strdup(BINDMOUNT_ALLOWED_PREFIXES);
1164
0
  attribute_cleanup_str char *orig_prefixes = prefixes;
1165
0
  (void)orig_prefixes;
1166
1167
0
  char *prefix = NULL;
1168
0
  bool found_prefix = false;
1169
0
  if (!is_canonical_path(path)) {
1170
0
    while ((prefix = tokenize(&prefixes, ",")) != NULL) {
1171
0
      if (path_is_parent(prefix, path)) {
1172
0
        found_prefix = true;
1173
0
        break;
1174
0
      }
1175
0
    }
1176
0
    if (!found_prefix) {
1177
      /*
1178
       * If the path does not include one of the allowed
1179
       * prefixes, fail.
1180
       */
1181
0
      warn("path '%s' is not a canonical path", path);
1182
0
      return false;
1183
0
    }
1184
0
  }
1185
0
  return true;
1186
0
}
1187
1188
/*
 * Queue a mount to be performed inside the jail.
 * |dest| must be absolute.  |data| of NULL selects secure per-fs defaults
 * (currently only for tmpfs); "" requests the kernel defaults; |flags| of 0
 * selects MS_NODEV|MS_NOEXEC|MS_NOSUID.  Implicitly enables a new VFS
 * namespace unless the jail was asked to enter an existing one.
 * Returns 0, -EINVAL for a relative |dest|, or -ENOMEM.
 */
int API minijail_mount_with_data(struct minijail *j, const char *src,
				 const char *dest, const char *type,
				 unsigned long flags, const char *data)
{
	struct mountpoint *m;

	if (*dest != '/')
		return -EINVAL;
	m = calloc(1, sizeof(*m));
	if (!m)
		return -ENOMEM;
	m->dest = strdup(dest);
	if (!m->dest)
		goto error;
	m->src = strdup(src);
	if (!m->src)
		goto error;
	m->type = strdup(type);
	if (!m->type)
		goto error;

	if (!data || !data[0]) {
		/*
		 * Set up secure defaults for certain filesystems.  Adding this
		 * fs-specific logic here kind of sucks, but considering how
		 * people use these in practice, it's probably OK.  If they want
		 * the kernel defaults, they can pass data="" instead of NULL.
		 */
		if (streq(type, "tmpfs")) {
			/* tmpfs defaults to mode=1777 and size=50%. */
			data = "mode=0755,size=10M";
		}
	}
	if (data) {
		m->data = strdup(data);
		if (!m->data)
			goto error;
		m->has_data = 1;
	}

	/* If they don't specify any flags, default to secure ones. */
	if (flags == 0)
		flags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
	m->flags = flags;

	/*
	 * Unless asked to enter an existing namespace, force vfs namespacing
	 * so the mounts don't leak out into the containing vfs namespace.
	 * If Minijail is being asked to enter the root vfs namespace this will
	 * leak mounts, but it's unlikely that the user would ask to do that by
	 * mistake.
	 */
	if (!j->flags.enter_vfs)
		minijail_namespace_vfs(j);

	/* Append to the mounts list; order is preserved at mount time. */
	if (j->mounts_tail)
		j->mounts_tail->next = m;
	else
		j->mounts_head = m;
	j->mounts_tail = m;
	j->mounts_count++;

	return 0;

error:
	/* Partially built node: free whatever strdup()s succeeded. */
	free(m->type);
	free(m->src);
	free(m->dest);
	free(m);
	return -ENOMEM;
}
1259
1260
/* Convenience wrapper: queue a mount with no fs-specific data string. */
int API minijail_mount(struct minijail *j, const char *src, const char *dest,
		       const char *type, unsigned long flags)
{
	return minijail_mount_with_data(j, src, dest, type, flags, NULL);
}
1265
1266
/*
 * Queue a bind mount of |src| onto |dest|, read-only unless |writeable|.
 * Returns 0, -ELOOP for a disallowed source path, or a
 * minijail_mount() error.
 */
int API minijail_bind(struct minijail *j, const char *src, const char *dest,
		      int writeable)
{
	/*
	 * Check for symlinks in bind-mount source paths to warn the user early.
	 * Minijail will perform one final check immediately before the mount()
	 * call.
	 */
	if (!is_valid_bind_path(src)) {
		warn("src '%s' is not a valid bind mount path", src);
		return -ELOOP;
	}

	/*
	 * Symlinks in |dest| are blocked by the ChromiumOS LSM:
	 * <kernel>/security/chromiumos/lsm.c#77
	 */

	unsigned long mount_flags =
	    writeable ? MS_BIND : (MS_BIND | MS_RDONLY);

	/*
	 * |type| is ignored for bind mounts, use it to signal that this mount
	 * came from minijail_bind().
	 * TODO(b/238362528): Implement a better way to signal this.
	 */
	return minijail_mount(j, src, dest, "minijail_bind", mount_flags);
}
1296
1297
/*
 * Queue a remount of |mount_name| with |remount_mode| inside the jail.
 * Returns 0, -EINVAL for a relative path, or -ENOMEM.
 */
int API minijail_add_remount(struct minijail *j, const char *mount_name,
			     unsigned long remount_mode)
{
	/* Only absolute mount points are meaningful. */
	if (mount_name[0] != '/')
		return -EINVAL;

	struct minijail_remount *node = calloc(1, sizeof(*node));
	if (!node)
		return -ENOMEM;

	node->mount_name = strdup(mount_name);
	if (!node->mount_name) {
		free(node);
		return -ENOMEM;
	}
	node->remount_mode = remount_mode;

	/* Append to the singly linked remount list. */
	if (j->remounts_tail)
		j->remounts_tail->next = node;
	else
		j->remounts_head = node;
	j->remounts_tail = node;

	return 0;
}
1323
1324
/*
 * Register |hook| to run with |payload| at lifecycle point |event|.
 * Returns 0, -EINVAL for an unknown event, or -ENOMEM.
 */
int API minijail_add_hook(struct minijail *j, minijail_hook_t hook,
			  void *payload, minijail_hook_event_t event)
{
	if (event >= MINIJAIL_HOOK_EVENT_MAX)
		return -EINVAL;

	struct hook *node = calloc(1, sizeof(*node));
	if (!node)
		return -ENOMEM;

	node->hook = hook;
	node->payload = payload;
	node->event = event;

	/* Append, so hooks run in registration order. */
	if (j->hooks_tail)
		j->hooks_tail->next = node;
	else
		j->hooks_head = node;
	j->hooks_tail = node;

	return 0;
}
1347
1348
/*
 * Map |parent_fd| in the launcher to |child_fd| in the jailed process.
 * Returns 0, -EINVAL for negative fds, or -ENOMEM when the table is full.
 */
int API minijail_preserve_fd(struct minijail *j, int parent_fd, int child_fd)
{
	if (parent_fd < 0 || child_fd < 0)
		return -EINVAL;
	if (j->preserved_fd_count >= MAX_PRESERVED_FDS)
		return -ENOMEM;

	size_t slot = j->preserved_fd_count;
	j->preserved_fds[slot].parent_fd = parent_fd;
	j->preserved_fds[slot].child_fd = child_fd;
	j->preserved_fd_count = slot + 1;
	return 0;
}
1359
1360
/*
 * Set the LD_PRELOAD helper path; may be configured at most once.
 * Returns 0, -EINVAL if already set, or -ENOMEM.
 */
int API minijail_set_preload_path(struct minijail *j, const char *preload_path)
{
	if (j->preload_path != NULL)
		return -EINVAL;

	j->preload_path = strdup(preload_path);
	return j->preload_path ? 0 : -ENOMEM;
}
1369
1370
static void clear_seccomp_options(struct minijail *j)
1371
0
{
1372
0
  j->flags.seccomp_filter = 0;
1373
0
  j->flags.seccomp_filter_tsync = 0;
1374
0
  j->flags.seccomp_filter_logging = 0;
1375
0
  j->flags.seccomp_filter_allow_speculation = 0;
1376
0
  j->filter_len = 0;
1377
0
  j->filter_prog = NULL;
1378
0
  j->flags.no_new_privs = 0;
1379
0
  if (j->seccomp_policy_path) {
1380
0
    free(j->seccomp_policy_path);
1381
0
  }
1382
0
  j->seccomp_policy_path = NULL;
1383
0
}
1384
1385
/*
 * Probe whether seccomp filtering (and the requested TSYNC / SPEC_ALLOW
 * options) can work on this kernel.  Returns 1 when filters should be
 * loaded; returns 0 after clearing all seccomp options when seccomp is
 * unavailable and soft-failure is permitted.
 */
static int seccomp_should_use_filters(struct minijail *j)
{
	/* Deliberately invalid call: only the resulting errno matters. */
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL) == -1) {
		/*
		 * |errno| will be set to EINVAL when seccomp has not been
		 * compiled into the kernel. On certain platforms and kernel
		 * versions this is not a fatal failure. In that case, and only
		 * in that case, disable seccomp and skip loading the filters.
		 */
		if ((errno == EINVAL) && seccomp_can_softfail()) {
			warn("not loading seccomp filters, seccomp filter not "
			     "supported");
			clear_seccomp_options(j);
			return 0;
		}
		/*
		 * If |errno| != EINVAL or seccomp_can_softfail() is false,
		 * we can proceed. Worst case scenario minijail_enter() will
		 * abort() if seccomp fails.
		 */
	}
	if (j->flags.seccomp_filter_tsync) {
		/* Are the seccomp(2) syscall and the TSYNC option supported? */
		if (sys_seccomp(SECCOMP_SET_MODE_FILTER,
				SECCOMP_FILTER_FLAG_TSYNC, NULL) == -1) {
			int saved_errno = errno;
			if (saved_errno == ENOSYS && seccomp_can_softfail()) {
				warn("seccomp(2) syscall not supported");
				clear_seccomp_options(j);
				return 0;
			} else if (saved_errno == EINVAL &&
				   seccomp_can_softfail()) {
				warn(
				    "seccomp filter thread sync not supported");
				clear_seccomp_options(j);
				return 0;
			}
			/*
			 * Similar logic here. If seccomp_can_softfail() is
			 * false, or |errno| != ENOSYS, or |errno| != EINVAL,
			 * we can proceed. Worst case scenario minijail_enter()
			 * will abort() if seccomp or TSYNC fail.
			 */
		}
	}
	if (j->flags.seccomp_filter_allow_speculation) {
		/* Is the SPEC_ALLOW flag supported? */
		if (!seccomp_filter_flags_available(
			SECCOMP_FILTER_FLAG_SPEC_ALLOW)) {
			warn("allowing speculative execution on seccomp "
			     "processes not supported");
			/* Soft-disable just this option; filters still load. */
			j->flags.seccomp_filter_allow_speculation = 0;
		}
	}
	return 1;
}
1441
1442
/*
 * Install |filter| as the jail's seccomp program.
 * If |owned| is true, the jail takes ownership of |filter| as-is; otherwise
 * a deep copy (header plus instruction array) is made first.  Any previously
 * installed program is freed.  Returns 0 or -ENOMEM.
 */
static int set_seccomp_filters_internal(struct minijail *j,
					const struct sock_fprog *filter,
					bool owned)
{
	struct sock_fprog *fprog;

	if (owned) {
		/*
		 * If |owned| is true, it's OK to cast away the const-ness since
		 * we'll own the pointer going forward.
		 */
		fprog = (struct sock_fprog *)filter;
	} else {
		fprog = malloc(sizeof(struct sock_fprog));
		if (!fprog)
			return -ENOMEM;
		fprog->len = filter->len;
		fprog->filter = malloc(sizeof(struct sock_filter) * fprog->len);
		if (!fprog->filter) {
			free(fprog);
			return -ENOMEM;
		}
		memcpy(fprog->filter, filter->filter,
		       sizeof(struct sock_filter) * fprog->len);
	}

	/* Release any filter installed earlier before replacing it. */
	if (j->filter_prog) {
		free(j->filter_prog->filter);
		free(j->filter_prog);
	}

	j->filter_len = fprog->len;
	j->filter_prog = fprog;
	return 0;
}
1477
1478
/*
 * Compile the policy in |policy_file| (named |filename| for diagnostics)
 * into a BPF program and install it on |j|.
 * Chooses the blocked-syscall action from the jail's logging/tsync flags and
 * kernel capabilities.  Returns 0, -ENOMEM, or -1 on compile failure.
 */
static int parse_seccomp_filters(struct minijail *j, const char *filename,
				 FILE *policy_file)
{
	struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
	if (!fprog)
		return -ENOMEM;

	struct filter_options filteropts;

	/*
	 * Figure out filter options.
	 * Allow logging?
	 */
	filteropts.allow_logging =
	    debug_logging_allowed() && seccomp_is_logging_allowed(j);

	/* What to do on a blocked system call? */
	if (filteropts.allow_logging) {
		/* Prefer the kernel's native log action when available. */
		if (seccomp_ret_log_available())
			filteropts.action = ACTION_RET_LOG;
		else
			filteropts.action = ACTION_RET_TRAP;
	} else {
		if (j->flags.seccomp_filter_tsync) {
			if (seccomp_ret_kill_process_available()) {
				filteropts.action = ACTION_RET_KILL_PROCESS;
			} else {
				filteropts.action = ACTION_RET_TRAP;
			}
		} else {
			filteropts.action = ACTION_RET_KILL;
		}
	}

	/*
	 * If SECCOMP_RET_LOG is not available, need to allow extra syscalls
	 * for logging.
	 */
	filteropts.allow_syscalls_for_logging =
	    filteropts.allow_logging && !seccomp_ret_log_available();

	/* Whether to also allow syscalls for libc compatibility. */
	filteropts.include_libc_compatibility_allowlist =
	    allow_libc_compatibility_syscalls();

	/* Whether to fail on duplicate syscalls. */
	filteropts.allow_duplicate_syscalls = allow_duplicate_syscalls();

	if (compile_filter(filename, policy_file, fprog, &filteropts)) {
		free(fprog);
		return -1;
	}

	/* On success the jail owns |fprog| and its instruction array. */
	return set_seccomp_filters_internal(j, fprog, true /* owned */);
}
1533
1534
/*
 * Load and compile the seccomp policy at |path|.
 * Dies on open or compile failure; a soft seccomp failure silently
 * disables filtering instead.
 */
void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
{
	if (!seccomp_should_use_filters(j))
		return;

	/* "e" adds O_CLOEXEC so the policy fd cannot leak into children. */
	attribute_cleanup_fp FILE *policy = fopen(path, "re");
	if (policy == NULL)
		pdie("failed to open seccomp filter file '%s'", path);

	if (parse_seccomp_filters(j, path, policy) != 0)
		die("failed to compile seccomp filter BPF program in '%s'",
		    path);

	/* Record the policy origin, replacing any previous one. */
	free(j->seccomp_policy_path);
	j->seccomp_policy_path = strdup(path);
}
1553
1554
/*
 * Load and compile a seccomp policy from an already-open |fd|.
 * The fd is adopted by a FILE stream and closed when it goes out of scope.
 * Best-effort resolves the fd back to a filesystem path for diagnostics and
 * for |seccomp_policy_path| (which may end up NULL if resolution fails).
 */
void API minijail_parse_seccomp_filters_from_fd(struct minijail *j, int fd)
{
	char *fd_path, *path;
	attribute_cleanup_fp FILE *file = NULL;

	if (!seccomp_should_use_filters(j))
		return;

	/* fdopen() takes ownership of |fd|; the cleanup attr closes it. */
	file = fdopen(fd, "r");
	if (!file) {
		pdie("failed to associate stream with fd %d", fd);
	}

	/* Resolve /proc/self/fd/N to the real policy path, best effort. */
	if (asprintf(&fd_path, "/proc/self/fd/%d", fd) == -1)
		pdie("failed to create path for fd %d", fd);
	path = realpath(fd_path, NULL);
	if (path == NULL)
		pwarn("failed to get path of fd %d", fd);
	free(fd_path);

	if (parse_seccomp_filters(j, path ? path : "<fd>", file) != 0) {
		die("failed to compile seccomp filter BPF program from fd %d",
		    fd);
	}
	if (j->seccomp_policy_path) {
		free(j->seccomp_policy_path);
	}
	/* Ownership of the realpath() allocation moves to the jail. */
	j->seccomp_policy_path = path;
}
1583
1584
/*
 * Install a caller-provided, pre-compiled BPF program.
 * Incompatible with seccomp failure logging; dies on misuse or OOM.
 */
void API minijail_set_seccomp_filters(struct minijail *j,
				      const struct sock_fprog *filter)
{
	if (!seccomp_should_use_filters(j))
		return;

	if (seccomp_is_logging_allowed(j)) {
		die("minijail_log_seccomp_filter_failures() is incompatible "
		    "with minijail_set_seccomp_filters()");
	}

	/*
	 * set_seccomp_filters_internal() can only fail with ENOMEM.
	 * Furthermore, since we won't own the incoming filter, it will not be
	 * modified.
	 */
	if (set_seccomp_filters_internal(j, filter, false /* owned */) < 0)
		die("failed to set seccomp filter");
}
1604
1605
/*
 * Select the alt-syscall |table| for the jailed process.
 * Returns 0 or -ENOMEM.
 */
int API minijail_use_alt_syscall(struct minijail *j, const char *table)
{
	char *table_copy = strdup(table);

	if (table_copy == NULL)
		return -ENOMEM;
	j->alt_syscall_table = table_copy;
	j->flags.alt_syscall = 1;
	return 0;
}
1613
1614
/*
 * Incremental serialization cursor shared by minijail_marshal() and
 * minijail_size().
 */
struct marshal_state {
	size_t available; /* bytes of |buf| still writable */
	size_t total;	  /* bytes required for the complete blob */
	char *buf;	  /* next write position (NULL when only sizing) */
};
1619
1620
static void marshal_state_init(struct marshal_state *state, char *buf,
1621
             size_t available)
1622
0
{
1623
0
  state->available = available;
1624
0
  state->buf = buf;
1625
0
  state->total = 0;
1626
0
}
1627
1628
static void marshal_append(struct marshal_state *state, const void *src,
1629
         size_t length)
1630
0
{
1631
0
  size_t copy_len = MIN(state->available, length);
1632
1633
  /* Up to |available| will be written. */
1634
0
  if (copy_len) {
1635
0
    memcpy(state->buf, src, copy_len);
1636
0
    state->buf += copy_len;
1637
0
    state->available -= copy_len;
1638
0
  }
1639
  /* |total| will contain the expected length. */
1640
0
  state->total += length;
1641
0
}
1642
1643
/* Append a C string including its terminating NUL. */
static void marshal_append_string(struct marshal_state *state, const char *src)
{
	size_t len = strlen(src);

	marshal_append(state, src, len + 1);
}
1647
1648
static void marshal_mount(struct marshal_state *state,
1649
        const struct mountpoint *m)
1650
0
{
1651
0
  marshal_append(state, m->src, strlen(m->src) + 1);
1652
0
  marshal_append(state, m->dest, strlen(m->dest) + 1);
1653
0
  marshal_append(state, m->type, strlen(m->type) + 1);
1654
0
  marshal_append(state, (char *)&m->has_data, sizeof(m->has_data));
1655
0
  if (m->has_data)
1656
0
    marshal_append(state, m->data, strlen(m->data) + 1);
1657
0
  marshal_append(state, (char *)&m->flags, sizeof(m->flags));
1658
0
}
1659
1660
static void marshal_fs_rule(struct marshal_state *state,
1661
          const struct fs_rule *r)
1662
0
{
1663
0
  marshal_append(state, r->path, strlen(r->path) + 1);
1664
0
  marshal_append(state, (char *)&r->landlock_flags,
1665
0
           sizeof(r->landlock_flags));
1666
0
}
1667
1668
/*
 * Serialize the whole jail configuration into |state|.
 * IMPORTANT: the emission order here must mirror the consumption order in
 * minijail_unmarshal() exactly — the struct itself goes first and its stale
 * pointer fields double as "is this section present" flags.
 */
static void minijail_marshal_helper(struct marshal_state *state,
				    const struct minijail *j)
{
	struct mountpoint *m = NULL;
	struct fs_rule *r = NULL;
	size_t i;

	/* Raw struct first; pointer values inside act only as presence bits. */
	marshal_append(state, (char *)j, sizeof(*j));
	if (j->user)
		marshal_append_string(state, j->user);
	if (j->suppl_gid_list) {
		marshal_append(state, j->suppl_gid_list,
			       j->suppl_gid_count * sizeof(gid_t));
	}
	if (j->chrootdir)
		marshal_append_string(state, j->chrootdir);
	if (j->hostname)
		marshal_append_string(state, j->hostname);
	if (j->alt_syscall_table) {
		marshal_append(state, j->alt_syscall_table,
			       strlen(j->alt_syscall_table) + 1);
	}
	if (j->flags.seccomp_filter && j->filter_prog) {
		struct sock_fprog *fp = j->filter_prog;
		/* Only the instruction array; |len| travels in the struct. */
		marshal_append(state, (char *)fp->filter,
			       fp->len * sizeof(struct sock_filter));
	}
	for (m = j->mounts_head; m; m = m->next) {
		marshal_mount(state, m);
	}
	for (i = 0; i < j->cgroup_count; ++i)
		marshal_append_string(state, j->cgroups[i]);
	for (r = j->fs_rules_head; r; r = r->next)
		marshal_fs_rule(state, r);
	marshal_append(state, (char *)&j->fs_rules_fd, sizeof(j->fs_rules_fd));
	if (j->seccomp_policy_path)
		marshal_append_string(state, j->seccomp_policy_path);
}
1706
1707
/* Compute the serialized size of |j| via a zero-length marshal pass. */
size_t API minijail_size(const struct minijail *j)
{
	struct marshal_state sizing;

	marshal_state_init(&sizing, NULL, 0);
	minijail_marshal_helper(&sizing, j);
	return sizing.total;
}
1714
1715
int minijail_marshal(const struct minijail *j, char *buf, size_t available)
1716
0
{
1717
0
  struct marshal_state state;
1718
0
  marshal_state_init(&state, buf, available);
1719
0
  minijail_marshal_helper(&state, j);
1720
0
  return (state.total > available);
1721
0
}
1722
1723
int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
1724
0
{
1725
0
  size_t i;
1726
0
  size_t count;
1727
0
  size_t fs_rules_count;
1728
0
  int ret = -EINVAL;
1729
1730
0
  if (length < sizeof(*j))
1731
0
    goto out;
1732
0
  memcpy((void *)j, serialized, sizeof(*j));
1733
0
  serialized += sizeof(*j);
1734
0
  length -= sizeof(*j);
1735
1736
  /* Potentially stale pointers not used as signals. */
1737
0
  j->preload_path = NULL;
1738
0
  j->filename = NULL;
1739
0
  j->pid_file_path = NULL;
1740
0
  j->uidmap = NULL;
1741
0
  j->gidmap = NULL;
1742
0
  j->mounts_head = NULL;
1743
0
  j->mounts_tail = NULL;
1744
0
  j->remounts_head = NULL;
1745
0
  j->remounts_tail = NULL;
1746
0
  j->filter_prog = NULL;
1747
0
  j->hooks_head = NULL;
1748
0
  j->hooks_tail = NULL;
1749
0
  j->fs_rules_head = NULL;
1750
0
  j->fs_rules_tail = NULL;
1751
1752
0
  if (j->user) { /* stale pointer */
1753
0
    char *user = consumestr(&serialized, &length);
1754
0
    if (!user)
1755
0
      goto clear_pointers;
1756
0
    j->user = strdup(user);
1757
0
    if (!j->user)
1758
0
      goto clear_pointers;
1759
0
  }
1760
1761
0
  if (j->suppl_gid_list) { /* stale pointer */
1762
0
    if (j->suppl_gid_count > NGROUPS_MAX) {
1763
0
      goto bad_gid_list;
1764
0
    }
1765
0
    size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t);
1766
0
    void *gid_list_bytes =
1767
0
        consumebytes(gid_list_size, &serialized, &length);
1768
0
    if (!gid_list_bytes)
1769
0
      goto bad_gid_list;
1770
1771
0
    j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t));
1772
0
    if (!j->suppl_gid_list)
1773
0
      goto bad_gid_list;
1774
1775
0
    memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size);
1776
0
  }
1777
1778
0
  if (j->chrootdir) { /* stale pointer */
1779
0
    char *chrootdir = consumestr(&serialized, &length);
1780
0
    if (!chrootdir)
1781
0
      goto bad_chrootdir;
1782
0
    j->chrootdir = strdup(chrootdir);
1783
0
    if (!j->chrootdir)
1784
0
      goto bad_chrootdir;
1785
0
  }
1786
1787
0
  if (j->hostname) { /* stale pointer */
1788
0
    char *hostname = consumestr(&serialized, &length);
1789
0
    if (!hostname)
1790
0
      goto bad_hostname;
1791
0
    j->hostname = strdup(hostname);
1792
0
    if (!j->hostname)
1793
0
      goto bad_hostname;
1794
0
  }
1795
1796
0
  if (j->alt_syscall_table) { /* stale pointer */
1797
0
    char *alt_syscall_table = consumestr(&serialized, &length);
1798
0
    if (!alt_syscall_table)
1799
0
      goto bad_syscall_table;
1800
0
    j->alt_syscall_table = strdup(alt_syscall_table);
1801
0
    if (!j->alt_syscall_table)
1802
0
      goto bad_syscall_table;
1803
0
  }
1804
1805
0
  if (j->flags.seccomp_filter && j->filter_len > 0) {
1806
0
    size_t ninstrs = j->filter_len;
1807
0
    if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
1808
0
        ninstrs > USHRT_MAX)
1809
0
      goto bad_filters;
1810
1811
0
    size_t program_len = ninstrs * sizeof(struct sock_filter);
1812
0
    void *program = consumebytes(program_len, &serialized, &length);
1813
0
    if (!program)
1814
0
      goto bad_filters;
1815
1816
0
    j->filter_prog = malloc(sizeof(struct sock_fprog));
1817
0
    if (!j->filter_prog)
1818
0
      goto bad_filters;
1819
1820
0
    j->filter_prog->len = ninstrs;
1821
0
    j->filter_prog->filter = malloc(program_len);
1822
0
    if (!j->filter_prog->filter)
1823
0
      goto bad_filter_prog_instrs;
1824
1825
0
    memcpy(j->filter_prog->filter, program, program_len);
1826
0
  }
1827
1828
0
  count = j->mounts_count;
1829
0
  j->mounts_count = 0;
1830
0
  for (i = 0; i < count; ++i) {
1831
0
    unsigned long *flags;
1832
0
    int *has_data;
1833
0
    const char *dest;
1834
0
    const char *type;
1835
0
    const char *data = NULL;
1836
0
    const char *src = consumestr(&serialized, &length);
1837
0
    if (!src)
1838
0
      goto bad_mounts;
1839
0
    dest = consumestr(&serialized, &length);
1840
0
    if (!dest)
1841
0
      goto bad_mounts;
1842
0
    type = consumestr(&serialized, &length);
1843
0
    if (!type)
1844
0
      goto bad_mounts;
1845
0
    has_data =
1846
0
        consumebytes(sizeof(*has_data), &serialized, &length);
1847
0
    if (!has_data)
1848
0
      goto bad_mounts;
1849
0
    if (*has_data) {
1850
0
      data = consumestr(&serialized, &length);
1851
0
      if (!data)
1852
0
        goto bad_mounts;
1853
0
    }
1854
0
    flags = consumebytes(sizeof(*flags), &serialized, &length);
1855
0
    if (!flags)
1856
0
      goto bad_mounts;
1857
0
    if (minijail_mount_with_data(j, src, dest, type, *flags, data))
1858
0
      goto bad_mounts;
1859
0
  }
1860
1861
0
  count = j->cgroup_count;
1862
0
  j->cgroup_count = 0;
1863
0
  for (i = 0; i < count; ++i) {
1864
0
    char *cgroup = consumestr(&serialized, &length);
1865
0
    if (!cgroup)
1866
0
      goto bad_cgroups;
1867
0
    j->cgroups[i] = strdup(cgroup);
1868
0
    if (!j->cgroups[i])
1869
0
      goto bad_cgroups;
1870
0
    ++j->cgroup_count;
1871
0
  }
1872
1873
  /* Unmarshal fs_rules. */
1874
0
  fs_rules_count = j->fs_rules_count;
1875
0
  j->fs_rules_count = 0;
1876
0
  for (i = 0; i < fs_rules_count; ++i) {
1877
0
    const char *path = consumestr(&serialized, &length);
1878
0
    uint64_t landlock_flags;
1879
0
    void *landlock_flags_bytes =
1880
0
        consumebytes(sizeof(landlock_flags), &serialized, &length);
1881
1882
0
    if (!path)
1883
0
      goto bad_fs_rules;
1884
0
    memcpy(&landlock_flags, landlock_flags_bytes,
1885
0
           sizeof(landlock_flags));
1886
0
    if (!landlock_flags)
1887
0
      goto bad_fs_rules;
1888
0
    if (add_fs_restriction_path(j, path, landlock_flags))
1889
0
      goto bad_fs_rules;
1890
0
  }
1891
  /* Unmarshal fs_rules_fd. */
1892
0
  void *fs_rules_fd_bytes =
1893
0
      consumebytes(sizeof(j->fs_rules_fd), &serialized, &length);
1894
0
  memcpy(&j->fs_rules_fd, fs_rules_fd_bytes, sizeof(j->fs_rules_fd));
1895
0
  if (!j->fs_rules_fd)
1896
0
    goto bad_cgroups;
1897
1898
0
  if (j->seccomp_policy_path) { /* stale pointer */
1899
0
    char *seccomp_policy_path = consumestr(&serialized, &length);
1900
0
    if (!seccomp_policy_path)
1901
0
      goto bad_cgroups;
1902
0
    j->seccomp_policy_path = strdup(seccomp_policy_path);
1903
0
    if (!j->seccomp_policy_path)
1904
0
      goto bad_cgroups;
1905
0
  }
1906
1907
0
  return 0;
1908
1909
  /*
1910
   * If more is added after j->seccomp_policy_path, then this is needed:
1911
   * if (j->seccomp_policy_path)
1912
   *  free(j->seccomp_policy_path);
1913
   */
1914
1915
0
bad_cgroups:
1916
0
  free_mounts_list(j);
1917
0
  free_remounts_list(j);
1918
0
  for (i = 0; i < j->cgroup_count; ++i)
1919
0
    free(j->cgroups[i]);
1920
0
bad_fs_rules:
1921
0
  free_fs_rules_list(j);
1922
0
bad_mounts:
1923
0
  if (j->filter_prog && j->filter_prog->filter)
1924
0
    free(j->filter_prog->filter);
1925
0
bad_filter_prog_instrs:
1926
0
  if (j->filter_prog)
1927
0
    free(j->filter_prog);
1928
0
bad_filters:
1929
0
  if (j->alt_syscall_table)
1930
0
    free(j->alt_syscall_table);
1931
0
bad_syscall_table:
1932
0
  if (j->hostname)
1933
0
    free(j->hostname);
1934
0
bad_hostname:
1935
0
  if (j->chrootdir)
1936
0
    free(j->chrootdir);
1937
0
bad_chrootdir:
1938
0
  if (j->suppl_gid_list)
1939
0
    free(j->suppl_gid_list);
1940
0
bad_gid_list:
1941
0
  if (j->user)
1942
0
    free(j->user);
1943
0
clear_pointers:
1944
0
  j->user = NULL;
1945
0
  j->suppl_gid_list = NULL;
1946
0
  j->chrootdir = NULL;
1947
0
  j->hostname = NULL;
1948
0
  j->alt_syscall_table = NULL;
1949
0
  j->cgroup_count = 0;
1950
0
  j->fs_rules_count = 0;
1951
0
  j->seccomp_policy_path = NULL;
1952
0
out:
1953
0
  return ret;
1954
0
}
1955
1956
/* Description of one character device node to create in the minimal /dev. */
struct dev_spec {
	const char *name;   /* node name relative to the /dev root */
	mode_t mode;	    /* file-type bits | permissions */
	dev_t major, minor; /* device numbers passed to makedev() */
};

// clang-format off
/* Device nodes created by mount_dev(); note urandom is read-only (0444). */
static const struct dev_spec device_nodes[] = {
    {
	"null",
	S_IFCHR | 0666, 1, 3,
    },
    {
	"zero",
	S_IFCHR | 0666, 1, 5,
    },
    {
	"full",
	S_IFCHR | 0666, 1, 7,
    },
    {
	"urandom",
	S_IFCHR | 0444, 1, 9,
    },
    {
	"tty",
	S_IFCHR | 0666, 5, 0,
    },
};
// clang-format on
1986
1987
/* One symlink to create in /dev: |source| (link name) -> |dest| (target). */
struct dev_sym_spec {
	const char *source, *dest;
};

/* Symlinks created by mount_dev() alongside the device nodes above. */
static const struct dev_sym_spec device_symlinks[] = {
    {
	"ptmx",
	"pts/ptmx",
    },
    {
	"fd",
	"/proc/self/fd",
    },
    {
	"stdin",
	"fd/0",
    },
    {
	"stdout",
	"fd/1",
    },
    {
	"stderr",
	"fd/2",
    },
};
2013
2014
/*
2015
 * Clean up the temporary dev path we had setup previously.  In case of errors,
2016
 * we don't want to go leaking empty tempdirs.
2017
 */
2018
/*
 * Clean up the temporary dev path we had setup previously.  In case of
 * errors, we don't want to go leaking empty tempdirs.
 */
static void mount_dev_cleanup(char *dev_path)
{
	/* Lazy-detach whatever is mounted there, then drop the tempdir. */
	umount2(dev_path, MNT_DETACH);
	rmdir(dev_path);
	free(dev_path);
}
2024
2025
/*
 * Set up the pseudo /dev path at the temporary location.
 * See mount_dev_finalize for more details.
 *
 * On success returns 0 and stores the heap-allocated temp path in
 * |*dev_path_ret|; the caller must later pass it to mount_dev_finalize() or
 * mount_dev_cleanup(), which free it.  Returns nonzero on failure (the temp
 * dir is cleaned up internally).  Dies outright if the temp dir cannot even
 * be created.
 */
static int mount_dev(char **dev_path_ret)
{
	int ret;
	attribute_cleanup_fd int dev_fd = -1;
	size_t i;
	mode_t mask;
	char *dev_path;

	/*
	 * Create a temp path for the /dev init.  We'll relocate this to the
	 * final location later on in the startup process.
	 */
	dev_path = *dev_path_ret = strdup("/tmp/minijail.dev.XXXXXX");
	if (dev_path == NULL || mkdtemp(dev_path) == NULL)
		pdie("could not create temp path for /dev");

	/* Set up the empty /dev mount point first. */
	ret = mount("minijail-devfs", dev_path, "tmpfs", MS_NOEXEC | MS_NOSUID,
		    "size=5M,mode=755");
	if (ret) {
		rmdir(dev_path);
		return ret;
	}

	/* We want to set the mode directly from the spec. */
	mask = umask(0);

	/* Get a handle to the temp dev path for *at funcs below. */
	dev_fd = open(dev_path, O_DIRECTORY | O_PATH | O_CLOEXEC);
	if (dev_fd < 0) {
		ret = 1;
		goto done;
	}

	/* Create all the nodes in /dev. */
	for (i = 0; i < ARRAY_SIZE(device_nodes); ++i) {
		const struct dev_spec *ds = &device_nodes[i];
		ret = mknodat(dev_fd, ds->name, ds->mode,
			      makedev(ds->major, ds->minor));
		if (ret)
			goto done;
	}

	/* Create all the symlinks in /dev. */
	for (i = 0; i < ARRAY_SIZE(device_symlinks); ++i) {
		const struct dev_sym_spec *ds = &device_symlinks[i];
		ret = symlinkat(ds->dest, dev_fd, ds->source);
		if (ret)
			goto done;
	}

	/* Create empty dir for glibc shared mem APIs. */
	ret = mkdirat(dev_fd, "shm", 01777);
	if (ret)
		goto done;

	/* Restore old mask. */
done:
	umask(mask);

	/* On any failure after the tmpfs mount, tear the whole thing down. */
	if (ret)
		mount_dev_cleanup(dev_path);

	return ret;
}
2094
2095
/*
 * Relocate the temporary /dev mount to its final /dev place.
 * We have to do this two step process so people can bind mount extra
 * /dev paths like /dev/log.
 *
 * Takes ownership of |dev_path| (always released via mount_dev_cleanup,
 * whether or not the move succeeds).  Returns 0 on success, -1 on failure.
 */
static int mount_dev_finalize(const struct minijail *j, char *dev_path)
{
	int ret = -1;
	char *dest = NULL;

	/* Unmount the /dev mount if possible. */
	if (umount2("/dev", MNT_DETACH))
		goto done;

	/* Target is <chrootdir>/dev, or just /dev when there is no chroot. */
	if (asprintf(&dest, "%s/dev", j->chrootdir ?: "") < 0)
		goto done;

	/* Atomically move the staged tmpfs onto the real /dev. */
	if (mount(dev_path, dest, NULL, MS_MOVE, NULL))
		goto done;

	ret = 0;
done:
	free(dest);
	mount_dev_cleanup(dev_path);

	return ret;
}
2122
2123
/*
2124
 * mount_one: Applies mounts from @m for @j, recursing as needed.
2125
 * @j Minijail these mounts are for
2126
 * @m Head of list of mounts
2127
 *
2128
 * Returns 0 for success.
2129
 */
2130
static int mount_one(const struct minijail *j, struct mountpoint *m,
2131
         const char *dev_path)
2132
0
{
2133
0
  int ret;
2134
0
  char *dest;
2135
0
  bool do_remount = false;
2136
0
  bool has_bind_flag = mount_has_bind_flag(m);
2137
0
  bool has_remount_flag = !!(m->flags & MS_REMOUNT);
2138
0
  unsigned long original_mnt_flags = 0;
2139
2140
  /* We assume |dest| has a leading "/". */
2141
0
  if (dev_path && strncmp("/dev/", m->dest, 5) == 0) {
2142
    /*
2143
     * Since the temp path is rooted at /dev, skip that dest part.
2144
     */
2145
0
    if (asprintf(&dest, "%s%s", dev_path, m->dest + 4) < 0)
2146
0
      return -ENOMEM;
2147
0
  } else {
2148
0
    if (asprintf(&dest, "%s%s", j->chrootdir ?: "", m->dest) < 0)
2149
0
      return -ENOMEM;
2150
0
  }
2151
2152
0
  ret = setup_mount_destination(m->src, dest, j->uid, j->gid,
2153
0
              has_bind_flag);
2154
0
  if (ret) {
2155
0
    warn("cannot create mount target '%s'", dest);
2156
0
    goto error;
2157
0
  }
2158
2159
  /*
2160
   * Remount bind mounts that:
2161
   * - Come from the minijail_bind() API, and
2162
   * - Add the 'ro' flag
2163
   * since 'bind' and other flags can't both be specified in the same
2164
   * mount(2) call.
2165
   * Callers using minijail_mount() to perform bind mounts are expected to
2166
   * know what they're doing and call minijail_mount() with MS_REMOUNT as
2167
   * needed.
2168
   * Therefore, if the caller is asking for a remount (using MS_REMOUNT),
2169
   * there is no need to do an extra remount here.
2170
   */
2171
0
  if (has_bind_flag && strcmp(m->type, "minijail_bind") == 0 &&
2172
0
      !has_remount_flag) {
2173
    /*
2174
     * Grab the mount flags of the source. These are used to figure
2175
     * out whether the bind mount needs to be remounted read-only.
2176
     */
2177
0
    if (get_mount_flags(m->src, &original_mnt_flags)) {
2178
0
      warn("cannot get mount flags for '%s'", m->src);
2179
0
      goto error;
2180
0
    }
2181
2182
0
    if ((m->flags & MS_RDONLY) !=
2183
0
        (original_mnt_flags & MS_RDONLY)) {
2184
0
      do_remount = 1;
2185
      /*
2186
       * Restrict the mount flags to those that are
2187
       * user-settable in a MS_REMOUNT request, but excluding
2188
       * MS_RDONLY. The user-requested mount flags will
2189
       * dictate whether the remount will have that flag or
2190
       * not.
2191
       */
2192
0
      original_mnt_flags &=
2193
0
          (MS_USER_SETTABLE_MASK & ~MS_RDONLY);
2194
0
    }
2195
0
  }
2196
2197
  /*
2198
   * Do a final check for symlinks in |m->src|.
2199
   * |m->src| will only contain a valid path when purely bind-mounting
2200
   * (but not when remounting a bind mount).
2201
   *
2202
   * Short of having a version of mount(2) that can take fd's, this is the
2203
   * smallest we can make the TOCTOU window.
2204
   */
2205
0
  if (has_bind_flag && !has_remount_flag && !is_valid_bind_path(m->src)) {
2206
0
    warn("src '%s' is not a valid bind mount path", m->src);
2207
0
    goto error;
2208
0
  }
2209
2210
0
  ret = mount(m->src, dest, m->type, m->flags, m->data);
2211
0
  if (ret) {
2212
0
    pwarn("cannot mount '%s' as '%s' with flags %#lx", m->src, dest,
2213
0
          m->flags);
2214
0
    goto error;
2215
0
  }
2216
2217
  /* Remount *after* the initial mount. */
2218
0
  if (do_remount) {
2219
0
    ret =
2220
0
        mount(m->src, dest, NULL,
2221
0
        m->flags | original_mnt_flags | MS_REMOUNT, m->data);
2222
0
    if (ret) {
2223
0
      pwarn(
2224
0
          "cannot bind-remount '%s' as '%s' with flags %#lx",
2225
0
          m->src, dest,
2226
0
          m->flags | original_mnt_flags | MS_REMOUNT);
2227
0
      goto error;
2228
0
    }
2229
0
  }
2230
2231
0
  free(dest);
2232
0
  if (m->next)
2233
0
    return mount_one(j, m->next, dev_path);
2234
0
  return 0;
2235
2236
0
error:
2237
0
  free(dest);
2238
0
  return ret;
2239
0
}
2240
2241
/*
 * Apply all configured mounts for |j|, staging and finalizing the fake /dev
 * if requested.  Dies (or _exit()s with MINIJAIL_ERR_MOUNT) on any failure.
 */
static void process_mounts_or_die(const struct minijail *j)
{
	/*
	 * We have to mount /dev first in case there are bind mounts from
	 * the original /dev into the new unique tmpfs one.
	 */
	char *dev_path = NULL;
	if (j->flags.mount_dev && mount_dev(&dev_path))
		pdie("mount_dev failed");

	if (j->mounts_head && mount_one(j, j->mounts_head, dev_path)) {
		warn("mount_one failed with /dev at '%s'", dev_path);

		/* Don't leak the staged /dev tempdir on the failure path. */
		if (dev_path)
			mount_dev_cleanup(dev_path);

		_exit(MINIJAIL_ERR_MOUNT);
	}

	/*
	 * Once all bind mounts have been processed, move the temp dev to
	 * its final /dev home.
	 */
	if (j->flags.mount_dev && mount_dev_finalize(j, dev_path))
		pdie("mount_dev_finalize failed");
}
2267
2268
/*
 * Enter |j->chrootdir| via chroot(2), running pre-chroot hooks first.
 * The chdir("/") afterwards is required so the CWD is inside the new root.
 * Returns 0 on success, -errno on failure.
 */
static int enter_chroot(const struct minijail *j)
{
	run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_CHROOT);

	if (chroot(j->chrootdir))
		return -errno;

	if (chdir("/"))
		return -errno;

	return 0;
}
2280
2281
/*
 * Make |j->chrootdir| the root filesystem via pivot_root(2) and detach the
 * old root, running pre-chroot hooks first.  Unlike enter_chroot(), this
 * fully removes the original root from the mount namespace.  Dies on most
 * failures; returns 0 on success or -errno for the chdir/chroot steps.
 */
static int enter_pivot_root(const struct minijail *j)
{
	attribute_cleanup_fd int oldroot = -1;
	attribute_cleanup_fd int newroot = -1;

	run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_CHROOT);

	/*
	 * Keep the fd for both old and new root.
	 * It will be used in fchdir(2) later.
	 */
	oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
	if (oldroot < 0)
		pdie("failed to open / for fchdir");
	newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
	if (newroot < 0)
		pdie("failed to open %s for fchdir", j->chrootdir);

	/*
	 * To ensure j->chrootdir is the root of a filesystem,
	 * do a self bind mount.
	 */
	if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
		pdie("failed to bind mount '%s'", j->chrootdir);
	if (chdir(j->chrootdir))
		return -errno;
	/* pivot_root(".", ".") stacks the old root on top of the new one. */
	if (syscall(SYS_pivot_root, ".", "."))
		pdie("pivot_root");

	/*
	 * Now the old root is mounted on top of the new root. Use fchdir(2) to
	 * change to the old root and unmount it.
	 */
	if (fchdir(oldroot))
		pdie("failed to fchdir to old /");

	/*
	 * If skip_remount_private was enabled for minijail_enter(),
	 * there could be a shared mount point under |oldroot|. In that case,
	 * mounts under this shared mount point will be unmounted below, and
	 * this unmounting will propagate to the original mount namespace
	 * (because the mount point is shared). To prevent this unexpected
	 * unmounting, remove these mounts from their peer groups by recursively
	 * remounting them as MS_PRIVATE.
	 */
	if (mount(NULL, ".", NULL, MS_REC | MS_PRIVATE, NULL))
		pdie("failed to mount(/, private) before umount(/)");
	/* The old root might be busy, so use lazy unmount. */
	if (umount2(".", MNT_DETACH))
		pdie("umount(/)");
	/* Change back to the new root. */
	if (fchdir(newroot))
		return -errno;
	if (chroot("/"))
		return -errno;
	/* Set correct CWD for getcwd(3). */
	if (chdir("/"))
		return -errno;

	return 0;
}
2342
2343
static int mount_tmp(const struct minijail *j)
2344
0
{
2345
0
  const char fmt[] = "size=%zu,mode=1777";
2346
  /* Count for the user storing ULLONG_MAX literally + extra space. */
2347
0
  char data[sizeof(fmt) + sizeof("18446744073709551615ULL")];
2348
0
  int ret;
2349
2350
0
  ret = snprintf(data, sizeof(data), fmt, j->tmpfs_size);
2351
2352
0
  if (ret <= 0)
2353
0
    pdie("tmpfs size spec error");
2354
0
  else if ((size_t)ret >= sizeof(data))
2355
0
    pdie("tmpfs size spec too large");
2356
2357
0
  unsigned long flags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
2358
2359
0
  if (block_symlinks_in_noninit_mountns_tmp()) {
2360
0
    flags |= MS_NOSYMFOLLOW;
2361
0
  }
2362
2363
0
  return mount("none", "/tmp", "tmpfs", flags, data);
2364
0
}
2365
2366
static int remount_proc_readonly(const struct minijail *j)
2367
0
{
2368
0
  const char *kProcPath = "/proc";
2369
0
  const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
2370
  /*
2371
   * Right now, we're holding a reference to our parent's old mount of
2372
   * /proc in our namespace, which means using MS_REMOUNT here would
2373
   * mutate our parent's mount as well, even though we're in a VFS
2374
   * namespace (!). Instead, remove their mount from our namespace lazily
2375
   * (MNT_DETACH) and make our own.
2376
   *
2377
   * However, we skip this in the user namespace case because it will
2378
   * invariably fail. Every mount namespace is "owned" by the
2379
   * user namespace of the process that creates it. Mount namespace A is
2380
   * "less privileged" than mount namespace B if A is created off of B,
2381
   * and B is owned by a different user namespace.
2382
   * When a less privileged mount namespace is created, the mounts used to
2383
   * initialize it (coming from the more privileged mount namespace) come
2384
   * as a unit, and are locked together. This means that code running in
2385
   * the new mount (and user) namespace cannot piecemeal unmount
2386
   * individual mounts inherited from a more privileged mount namespace.
2387
   * See https://man7.org/linux/man-pages/man7/mount_namespaces.7.html,
2388
   * "Restrictions on mount namespaces" for details.
2389
   *
2390
   * This happens in our use case because we first enter a new user
2391
   * namespace (on clone(2)) and then we unshare(2) a new mount namespace,
2392
   * which means the new mount namespace is less privileged than its
2393
   * parent mount namespace. This would also happen if we entered a new
2394
   * mount namespace on clone(2), since the user namespace is created
2395
   * first.
2396
   * In all other non-user-namespace cases the new mount namespace is
2397
   * similarly privileged as the parent mount namespace so unmounting a
2398
   * single mount is allowed.
2399
   *
2400
   * We still remount /proc as read-only in the user namespace case
2401
   * because while a process with CAP_SYS_ADMIN in the new user namespace
2402
   * can unmount the RO mount and get at the RW mount, an attacker with
2403
   * access only to a write primitive will not be able to modify /proc.
2404
   */
2405
0
  if (!j->flags.userns && umount2(kProcPath, MNT_DETACH))
2406
0
    return -errno;
2407
0
  if (mount("proc", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
2408
0
    return -errno;
2409
0
  return 0;
2410
0
}
2411
2412
/*
 * Fatal-error helper for the parent: SIGKILL the jailed init child so it
 * doesn't outlive us, then die with |msg|.  Never returns.
 */
static void kill_child_and_die(const struct minijail *j, const char *msg)
{
	kill(j->initpid, SIGKILL);
	die("%s", msg);
}
2417
2418
/*
 * Record the child's pid in |j->pid_file_path|; on failure, kill the child
 * and abort rather than continue with an unreported jail.
 */
static void write_pid_file_or_die(const struct minijail *j)
{
	if (write_pid_to_path(j->initpid, j->pid_file_path))
		kill_child_and_die(j, "failed to write pid file");
}
2423
2424
static void add_to_cgroups_or_die(const struct minijail *j)
2425
0
{
2426
0
  size_t i;
2427
2428
0
  for (i = 0; i < j->cgroup_count; ++i) {
2429
0
    if (write_pid_to_path(j->initpid, j->cgroups[i]))
2430
0
      kill_child_and_die(j, "failed to add to cgroups");
2431
0
  }
2432
0
}
2433
2434
static void set_rlimits_or_die(const struct minijail *j)
2435
0
{
2436
0
  size_t i;
2437
2438
0
  for (i = 0; i < j->rlimit_count; ++i) {
2439
0
    struct rlimit limit;
2440
0
    limit.rlim_cur = j->rlimits[i].cur;
2441
0
    limit.rlim_max = j->rlimits[i].max;
2442
0
    if (prlimit(j->initpid, j->rlimits[i].type, &limit, NULL))
2443
0
      kill_child_and_die(j, "failed to set rlimit");
2444
0
  }
2445
0
}
2446
2447
/*
 * Write the child's uid_map/gid_map (and optionally deny setgroups) from the
 * parent side of a user namespace setup.  setgroups must be denied before
 * gid_map is written when requested; see user_namespaces(7).  Kills the
 * child and dies on any hard failure.
 */
static void write_ugid_maps_or_die(const struct minijail *j)
{
	if (j->uidmap && write_proc_file(j->initpid, j->uidmap, "uid_map") != 0)
		kill_child_and_die(j, "failed to write uid_map");
	if (j->gidmap && j->flags.disable_setgroups) {
		/*
		 * Older kernels might not have the /proc/<pid>/setgroups files.
		 */
		int ret = write_proc_file(j->initpid, "deny", "setgroups");
		if (ret != 0) {
			if (ret == -ENOENT) {
				/*
				 * See
				 * http://man7.org/linux/man-pages/man7/user_namespaces.7.html.
				 */
				warn("could not disable setgroups(2)");
			} else
				kill_child_and_die(
				    j, "failed to disable setgroups(2)");
		}
	}
	if (j->gidmap && write_proc_file(j->initpid, j->gidmap, "gid_map") != 0)
		kill_child_and_die(j, "failed to write gid_map");
}
2471
2472
static void enter_user_namespace(const struct minijail *j)
2473
0
{
2474
0
  int uid = j->flags.uid ? j->uid : 0;
2475
0
  int gid = j->flags.gid ? j->gid : 0;
2476
0
  if (j->gidmap && setresgid(gid, gid, gid)) {
2477
0
    pdie("user_namespaces: setresgid(%d, %d, %d) failed", gid, gid,
2478
0
         gid);
2479
0
  }
2480
0
  if (j->uidmap && setresuid(uid, uid, uid)) {
2481
0
    pdie("user_namespaces: setresuid(%d, %d, %d) failed", uid, uid,
2482
0
         uid);
2483
0
  }
2484
0
}
2485
2486
/*
 * Signal the child that parent-side setup is done by closing both ends of
 * the synchronization pipe (the child's read() then returns 0/EOF).
 */
static void parent_setup_complete(int *pipe_fds)
{
	close_and_reset(&pipe_fds[0]);
	close_and_reset(&pipe_fds[1]);
}
2491
2492
/*
 * wait_for_parent_setup: Called by the child process to wait for any
 * further parent-side setup to complete before continuing.
 *
 * Dies unless the read() returns 0 (EOF), i.e. the parent closed its write
 * end cleanly via parent_setup_complete().
 */
static void wait_for_parent_setup(int *pipe_fds)
{
	char buf;

	/* Close our copy of the write end so EOF can be delivered. */
	close_and_reset(&pipe_fds[1]);

	/* Wait for parent to complete setup and close the pipe. */
	if (read(pipe_fds[0], &buf, 1) != 0)
		die("failed to sync with parent");
	close_and_reset(&pipe_fds[0]);
}
2507
2508
/*
 * Drop to the jail's configured user/group identity.  Supplementary groups
 * are handled first (inherit via initgroups, set explicitly, or clear),
 * then the gid, then the uid — setresuid() must come last since it drops
 * the privilege needed for the earlier steps.  Dies on any failure.
 */
static void drop_ugid(const struct minijail *j)
{
	/* The three supplementary-group modes are mutually exclusive. */
	if (j->flags.inherit_suppl_gids + j->flags.keep_suppl_gids +
		j->flags.set_suppl_gids >
	    1) {
		die("can only do one of inherit, keep, or set supplementary "
		    "groups");
	}

	if (j->flags.inherit_suppl_gids) {
		if (initgroups(j->user, j->usergid))
			pdie("initgroups(%s, %d) failed", j->user, j->usergid);
	} else if (j->flags.set_suppl_gids) {
		if (setgroups(j->suppl_gid_count, j->suppl_gid_list))
			pdie("setgroups(suppl_gids) failed");
	} else if (!j->flags.keep_suppl_gids && !j->flags.disable_setgroups) {
		/*
		 * Only attempt to clear supplementary groups if we are changing
		 * users or groups, and if the caller did not request to disable
		 * setgroups (used when entering a user namespace as a
		 * non-privileged user).
		 */
		if ((j->flags.uid || j->flags.gid) && setgroups(0, NULL))
			pdie("setgroups(0, NULL) failed");
	}

	if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
		pdie("setresgid(%d, %d, %d) failed", j->gid, j->gid, j->gid);

	if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
		pdie("setresuid(%d, %d, %d) failed", j->uid, j->uid, j->uid);
}
2540
2541
/*
 * Remove from the capability bounding set every capability whose bit is not
 * set in |keep_mask|, up to |last_valid_cap|.  Dies if a drop fails.
 */
static void drop_capbset(uint64_t keep_mask, unsigned int last_valid_cap)
{
	for (unsigned int cap = 0;
	     cap < sizeof(keep_mask) * 8 && cap <= last_valid_cap; ++cap) {
		const uint64_t bit = (uint64_t)1 << cap;
		if ((keep_mask & bit) == 0 && prctl(PR_CAPBSET_DROP, cap))
			pdie("could not drop capability from bounding set");
	}
}
2552
2553
/*
 * Reduce this process's capabilities to exactly |j->caps| (plus a temporary
 * CAP_SETPCAP needed to shrink the bounding set), then optionally raise the
 * same set as ambient capabilities.  The multi-phase ordering below is
 * deliberate: set E/P/I first, drop the bounding set while CAP_SETPCAP is
 * still held, then remove CAP_SETPCAP if it wasn't requested.  Dies on any
 * failure.  No-op unless |j->flags.use_caps| is set.
 */
static void drop_caps(const struct minijail *j, unsigned int last_valid_cap)
{
	if (!j->flags.use_caps)
		return;

	cap_t caps = cap_get_proc();
	cap_value_t flag[1];
	const size_t ncaps = sizeof(j->caps) * 8;
	const uint64_t one = 1;
	unsigned int i;
	if (!caps)
		die("can't get process caps");
	if (cap_clear(caps))
		die("can't clear caps");

	/* Phase 1: raise the requested caps (and CAP_SETPCAP) in E/P/I. */
	for (i = 0; i < ncaps && i <= last_valid_cap; ++i) {
		/* Keep CAP_SETPCAP for dropping bounding set bits. */
		if (i != CAP_SETPCAP && !(j->caps & (one << i)))
			continue;
		flag[0] = i;
		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
			die("can't add effective cap");
		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
			die("can't add permitted cap");
		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
			die("can't add inheritable cap");
	}
	if (cap_set_proc(caps))
		die("can't apply initial cleaned capset");

	/*
	 * Instead of dropping the bounding set first, do it here in case
	 * the caller had a more permissive bounding set which could
	 * have been used above to raise a capability that wasn't already
	 * present. This requires CAP_SETPCAP, so we raised/kept it above.
	 *
	 * However, if we're asked to skip setting *and* locking the
	 * SECURE_NOROOT securebit, also skip dropping the bounding set.
	 * If the caller wants to regain all capabilities when executing a
	 * set-user-ID-root program, allow them to do so. The default behavior
	 * (i.e. the behavior without |securebits_skip_mask| set) will still put
	 * the jailed process tree in a capabilities-only environment.
	 *
	 * We check the negated skip mask for SECURE_NOROOT and
	 * SECURE_NOROOT_LOCKED. If the bits are set in the negated mask they
	 * will *not* be skipped in lock_securebits(), and therefore we should
	 * drop the bounding set.
	 */
	if (secure_noroot_set_and_locked(~j->securebits_skip_mask)) {
		drop_capbset(j->caps, last_valid_cap);
	} else {
		warn("SECURE_NOROOT not set, not dropping bounding set");
	}

	/* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
	if ((j->caps & (one << CAP_SETPCAP)) == 0) {
		flag[0] = CAP_SETPCAP;
		if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
			die("can't clear effective cap");
		if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
			die("can't clear permitted cap");
		if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
			die("can't clear inheritable cap");
	}

	if (cap_set_proc(caps))
		die("can't apply final cleaned capset");

	/*
	 * If ambient capabilities are supported, clear all capabilities first,
	 * then raise the requested ones.
	 */
	if (j->flags.set_ambient_caps) {
		if (!cap_ambient_supported()) {
			pdie("ambient capabilities not supported");
		}
		if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) !=
		    0) {
			pdie("can't clear ambient capabilities");
		}

		for (i = 0; i < ncaps && i <= last_valid_cap; ++i) {
			if (!(j->caps & (one << i)))
				continue;

			if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i, 0,
				  0) != 0) {
				pdie("prctl(PR_CAP_AMBIENT, "
				     "PR_CAP_AMBIENT_RAISE, %u) failed",
				     i);
			}
		}
	}

	cap_free(caps);
}
2649
2650
/* Calls landlock_restrict_self(), based on current inodes. */
/*
 * Populates the Landlock ruleset fd with every queued fs rule, then enforces
 * it on this process and closes the fd.  No-op when fs restrictions are
 * disabled or no rules were added; dies if enforcement itself fails.
 */
static void apply_landlock_restrictions(const struct minijail *j)
{
	struct fs_rule *r = j->fs_rules_head;
	/* The ruleset_fd needs to be mutable so use a stack copy from now on.
	 */
	int ruleset_fd = j->fs_rules_fd;
	if (!j->flags.enable_fs_restrictions || !r) {
		return;
	}

	if (minijail_is_fs_restriction_available()) {
		while (r) {
			populate_ruleset_internal(r->path, ruleset_fd,
						  r->landlock_flags);
			r = r->next;
		}
	}

	if (ruleset_fd >= 0) {
		if (j->filename != NULL) {
			info("applying Landlock to process %s", j->filename);
		}
		if (landlock_restrict_self(ruleset_fd, 0)) {
			pdie("failed to enforce ruleset");
		}
		close(ruleset_fd);
	}
}
2679
2680
static void set_no_new_privs(const struct minijail *j)
2681
0
{
2682
0
  if (j->flags.no_new_privs) {
2683
0
    if (!sys_set_no_new_privs()) {
2684
0
      die("set_no_new_privs() failed");
2685
0
    }
2686
0
  }
2687
0
}
2688
2689
/*
 * Install the jail's seccomp-BPF filter program, first arranging SIGSYS
 * handling to match the logging/tsync configuration.  Skipped entirely under
 * ASan builds, whose instrumentation makes unfiltered syscalls.  Dies on
 * failure.
 */
static void set_seccomp_filter(const struct minijail *j)
{
	/*
	 * Code running with ASan
	 * (https://github.com/google/sanitizers/wiki/AddressSanitizer)
	 * will make system calls not included in the syscall filter policy,
	 * which will likely crash the program. Skip setting seccomp filter in
	 * that case.
	 * 'running_with_asan()' has no inputs and is completely defined at
	 * build time, so this cannot be used by an attacker to skip setting
	 * seccomp filter.
	 */
	if (j->flags.seccomp_filter && running_with_asan()) {
		warn("running with (HW)ASan, not setting seccomp filter");
		return;
	}

	if (j->flags.seccomp_filter) {
		if (seccomp_is_logging_allowed(j)) {
			warn("logging seccomp filter failures");
			if (!seccomp_ret_log_available()) {
				/*
				 * If SECCOMP_RET_LOG is not available,
				 * install the SIGSYS handler first.
				 */
				if (install_sigsys_handler())
					pdie(
					    "failed to install SIGSYS handler");
			}
		} else if (j->flags.seccomp_filter_tsync) {
			/*
			 * If setting thread sync,
			 * reset the SIGSYS signal handler so that
			 * the entire thread group is killed.
			 */
			if (signal(SIGSYS, SIG_DFL) == SIG_ERR)
				pdie("failed to reset SIGSYS disposition");
		}
	}

	/*
	 * Install the syscall filter.
	 */
	if (j->flags.seccomp_filter) {
		/* seccomp(2) is needed for TSYNC/SPEC_ALLOW flags; plain
		 * prctl(2) suffices otherwise. */
		if (j->flags.seccomp_filter_tsync ||
		    j->flags.seccomp_filter_allow_speculation) {
			int filter_flags =
			    (j->flags.seccomp_filter_tsync
			   ? SECCOMP_FILTER_FLAG_TSYNC
			   : 0) |
			    (j->flags.seccomp_filter_allow_speculation
			   ? SECCOMP_FILTER_FLAG_SPEC_ALLOW
			   : 0);
			if (sys_seccomp(SECCOMP_SET_MODE_FILTER, filter_flags,
					j->filter_prog)) {
				pdie("seccomp(tsync) failed");
			}
		} else {
			if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
				  j->filter_prog)) {
				pdie("prctl(seccomp_filter) failed");
			}
		}
	}
}
2754
2755
/* Pid of the jailed child to forward caught signals to; -1 when unset. */
static pid_t forward_pid = -1;

/*
 * Signal handler that relays the received signal to the jailed child.
 * kill(2) is async-signal-safe, so this is safe to run in handler context.
 */
static void forward_signal(int sig, siginfo_t *siginfo attribute_unused,
			   void *void_context attribute_unused)
{
	if (forward_pid != -1) {
		kill(forward_pid, sig);
	}
}
2764
2765
static void install_signal_handlers(void)
2766
0
{
2767
0
  struct sigaction act;
2768
2769
0
  memset(&act, 0, sizeof(act));
2770
0
  act.sa_sigaction = &forward_signal;
2771
0
  act.sa_flags = SA_SIGINFO | SA_RESTART;
2772
2773
  /* Handle all signals, except SIGCHLD. */
2774
0
  for (int sig = 1; sig < NSIG; sig++) {
2775
    /*
2776
     * We don't care if we get EINVAL: that just means that we
2777
     * can't handle this signal, so let's skip it and continue.
2778
     */
2779
0
    sigaction(sig, &act, NULL);
2780
0
  }
2781
  /* Reset SIGCHLD's handler. */
2782
0
  signal(SIGCHLD, SIG_DFL);
2783
2784
  /* Handle real-time signals. */
2785
0
  for (int sig = SIGRTMIN; sig <= SIGRTMAX; sig++) {
2786
0
    sigaction(sig, &act, NULL);
2787
0
  }
2788
0
}
2789
2790
/*
 * Map a hook event enum to a human-readable name for error messages.
 * Returns a static string; "unknown" for unexpected values.
 */
static const char *lookup_hook_name(minijail_hook_event_t event)
{
	/* No default case on purpose: a new enum value must trigger a
	 * compiler diagnostic here instead of silently falling through. */
	switch (event) {
	case MINIJAIL_HOOK_EVENT_PRE_DROP_CAPS:
		return "pre-drop-caps";
	case MINIJAIL_HOOK_EVENT_PRE_EXECVE:
		return "pre-execve";
	case MINIJAIL_HOOK_EVENT_PRE_CHROOT:
		return "pre-chroot";
	case MINIJAIL_HOOK_EVENT_MAX:
		/*
		 * Adding this in favor of a default case to force the
		 * compiler to error out if a new enum value is added.
		 */
		break;
	}
	return "unknown";
}
2808
2809
static void run_hooks_or_die(const struct minijail *j,
2810
           minijail_hook_event_t event)
2811
0
{
2812
0
  int rc;
2813
0
  int hook_index = 0;
2814
0
  for (struct hook *c = j->hooks_head; c; c = c->next) {
2815
0
    if (c->event != event)
2816
0
      continue;
2817
0
    rc = c->hook(c->payload);
2818
0
    if (rc != 0) {
2819
0
      errno = -rc;
2820
0
      pdie("%s hook (index %d) failed",
2821
0
           lookup_hook_name(event), hook_index);
2822
0
    }
2823
    /* Only increase the index within the same hook event type. */
2824
0
    ++hook_index;
2825
0
  }
2826
0
}
2827
2828
/*
 * Applies all of |j|'s confinement settings to the calling process.  The
 * steps are ordered deliberately (namespaces -> mounts -> chroot/pivot ->
 * hooks -> capability/uid drops -> seccomp); any failure aborts the whole
 * process via die()/pdie() because privileges may already be partially
 * dropped and there is no safe way to recover.
 */
void API minijail_enter(const struct minijail *j)
{
	/*
	 * If we're dropping caps, get the last valid cap from /proc now,
	 * since /proc can be unmounted before drop_caps() is called.
	 */
	unsigned int last_valid_cap = 0;
	if (j->flags.capbset_drop || j->flags.use_caps)
		last_valid_cap = get_last_valid_cap();

	if (j->flags.pids)
		die("tried to enter a pid-namespaced jail;"
		    " try minijail_run()?");

	if (j->flags.inherit_suppl_gids && !j->user)
		die("cannot inherit supplementary groups without setting a "
		    "username");

	/*
	 * We can't recover from failures if we've dropped privileges partially,
	 * so we don't even try. If any of our operations fail, we abort() the
	 * entire process.
	 */
	if (j->flags.enter_vfs) {
		/* Join a pre-existing mount namespace via its fd. */
		if (setns(j->mountns_fd, CLONE_NEWNS))
			pdie("setns(CLONE_NEWNS) failed");
		close(j->mountns_fd);
	}

	if (j->flags.vfs) {
		if (unshare(CLONE_NEWNS))
			pdie("unshare(CLONE_NEWNS) failed");
		/*
		 * By default, remount all filesystems as private, unless
		 * - Passed a specific remount mode, in which case remount with
		 *   that,
		 * - Asked not to remount at all, in which case skip the
		 *   mount(2) call.
		 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
		 */
		if (j->remount_mode) {
			if (mount(NULL, "/", NULL, MS_REC | j->remount_mode,
				  NULL))
				pdie("mount(NULL, /, NULL, "
				     "MS_REC | j->remount_mode, NULL) failed");

			/* Per-path remount overrides requested by the caller. */
			struct minijail_remount *temp = j->remounts_head;
			while (temp) {
				if (temp->remount_mode < j->remount_mode)
					die("cannot remount %s as stricter "
					    "than the root dir",
					    temp->mount_name);
				if (mount(NULL, temp->mount_name, NULL,
					  MS_REC | temp->remount_mode, NULL))
					pdie("mount(NULL, %s, NULL, "
					     "MS_REC | temp->remount_mode, "
					     "NULL) failed",
					     temp->mount_name);
				temp = temp->next;
			}
		}
	}

	if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
		pdie("unshare(CLONE_NEWIPC) failed");
	}

	if (j->flags.uts) {
		if (unshare(CLONE_NEWUTS))
			pdie("unshare(CLONE_NEWUTS) failed");

		/* Only meaningful inside the fresh UTS namespace. */
		if (j->hostname &&
		    sethostname(j->hostname, strlen(j->hostname)))
			pdie("sethostname(%s) failed", j->hostname);
	}

	if (j->flags.enter_net) {
		/* Join a pre-existing net namespace via its fd. */
		if (setns(j->netns_fd, CLONE_NEWNET))
			pdie("setns(CLONE_NEWNET) failed");
		close(j->netns_fd);
	} else if (j->flags.net) {
		if (unshare(CLONE_NEWNET))
			pdie("unshare(CLONE_NEWNET) failed");
		if (j->flags.net_loopback)
			config_net_loopback();
	}

	if (j->flags.ns_cgroups && unshare(CLONE_NEWCGROUP))
		pdie("unshare(CLONE_NEWCGROUP) failed");

	if (j->flags.new_session_keyring) {
		if (syscall(SYS_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL) < 0)
			pdie("keyctl(KEYCTL_JOIN_SESSION_KEYRING) failed");
	}

	/* We have to process all the mounts before we chroot/pivot_root. */
	process_mounts_or_die(j);

	if (j->flags.chroot && enter_chroot(j))
		pdie("chroot");

	if (j->flags.pivot_root && enter_pivot_root(j))
		pdie("pivot_root");

	if (j->flags.mount_tmp && mount_tmp(j))
		pdie("mount_tmp");

	if (j->flags.remount_proc_ro && remount_proc_readonly(j))
		pdie("remount");

	/* User hooks run while we still hold full capabilities. */
	run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_DROP_CAPS);

	/*
	 * If we're only dropping capabilities from the bounding set, but not
	 * from the thread's (permitted|inheritable|effective) sets, do it now.
	 */
	if (j->flags.capbset_drop) {
		drop_capbset(j->cap_bset, last_valid_cap);
	}

	/*
	 * POSIX capabilities are a bit tricky. We must set SECBIT_KEEP_CAPS
	 * before drop_ugid() below as the latter would otherwise drop all
	 * capabilities.
	 */
	if (j->flags.use_caps) {
		/*
		 * When using ambient capabilities, CAP_SET{GID,UID} can be
		 * inherited across execve(2), so SECBIT_KEEP_CAPS is not
		 * strictly needed.
		 */
		bool require_keep_caps = !j->flags.set_ambient_caps;
		if (lock_securebits(j->securebits_skip_mask,
				    require_keep_caps) < 0) {
			pdie("locking securebits failed");
		}
	}

	if (j->flags.no_new_privs) {
		/*
		 * If we're setting no_new_privs, we can drop privileges
		 * before setting seccomp filter. This way filter policies
		 * don't need to allow privilege-dropping syscalls.
		 */
		drop_ugid(j);
		drop_caps(j, last_valid_cap);

		/*
		 * Landlock is applied as late as possible. If no_new_privs is
		 * requested, then we need to set that first because the
		 * landlock_restrict_self() syscall has a seccomp(2) like check
		 * for that. See:
		 * https://elixir.bootlin.com/linux/v5.15.74/source/security/landlock/syscalls.c#L409
		 */
		set_no_new_privs(j);
		apply_landlock_restrictions(j);
		set_seccomp_filter(j);
	} else {
		apply_landlock_restrictions(j);

		/*
		 * If we're not setting no_new_privs,
		 * we need to set seccomp filter *before* dropping privileges.
		 * WARNING: this means that filter policies *must* allow
		 * setgroups()/setresgid()/setresuid() for dropping root and
		 * capget()/capset()/prctl() for dropping caps.
		 */
		set_seccomp_filter(j);
		drop_ugid(j);
		drop_caps(j, last_valid_cap);
	}

	/*
	 * Select the specified alternate syscall table.  The table must not
	 * block prctl(2) if we're using seccomp as well.
	 */
	if (j->flags.alt_syscall) {
		if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table))
			pdie("prctl(PR_ALT_SYSCALL) failed");
	}

	/*
	 * seccomp has to come last since it cuts off all the other
	 * privilege-dropping syscalls :)
	 */
	if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
		if ((errno == EINVAL) && seccomp_can_softfail()) {
			warn("seccomp not supported");
			return;
		}
		pdie("prctl(PR_SET_SECCOMP) failed");
	}
}
3021
3022
/* TODO(wad): will visibility affect this variable? */
/* Exit status of the jailed root child, reported by the init stub below. */
static int init_exitstatus = 0;

/* SIGTERM handler for the init stub: exit with the saved child status. */
static void init_term(int sig attribute_unused)
{
	_exit(init_exitstatus);
}
3029
3030
/*
 * Minimal "init" for a PID namespace: reaps children until none remain,
 * remembering the wait status of |rootpid|, then exits with that status
 * (or MINIJAIL_ERR_INIT if the root child did not exit normally).
 * Never returns.
 */
static void init(pid_t rootpid)
{
	pid_t pid;
	int status;
	/* So that we exit with the right status. */
	signal(SIGTERM, init_term);
	/* TODO(wad): self jail with seccomp filters here. */
	while ((pid = wait(&status)) > 0) {
		/*
		 * This loop will only end when either there are no processes
		 * left inside our pid namespace or we get a signal.
		 */
		if (pid == rootpid)
			init_exitstatus = status;
	}
	/* Abnormal termination of the root child maps to MINIJAIL_ERR_INIT. */
	if (!WIFEXITED(init_exitstatus))
		_exit(MINIJAIL_ERR_INIT);
	_exit(WEXITSTATUS(init_exitstatus));
}
3049
3050
/*
 * Reads a marshalled minijail ([size][payload]) from |fd| and unmarshals
 * it into |j|.  Returns 0 on success or a negative errno-style value.
 */
int API minijail_from_fd(int fd, struct minijail *j)
{
	size_t sz = 0;
	int err = read_exactly(fd, &sz, sizeof(sz));
	attribute_cleanup_str char *buf = NULL;
	if (err) {
		pwarn("failed to read marshalled minijail size");
		return err;
	}
	/*
	 * Reject nonsensical sizes before allocating: a zero size cannot
	 * hold a marshalled jail and malloc(0) may legally return NULL,
	 * which would be misreported as -ENOMEM below.
	 */
	if (sz == 0 || sz > USHRT_MAX) /* arbitrary check */
		return -E2BIG;
	buf = malloc(sz);
	if (!buf)
		return -ENOMEM;
	err = read_exactly(fd, buf, sz);
	if (err) {
		pwarn("failed to read marshalled minijail payload");
		return err;
	}
	return minijail_unmarshal(j, buf, sz);
}
3071
3072
/*
 * Serializes |j| to |fd| as [size][marshalled minijail].
 * Returns 0 on success or a negative errno-style value.
 */
int API minijail_to_fd(struct minijail *j, int fd)
{
	size_t len = minijail_size(j);

	if (len == 0)
		return -EINVAL;

	attribute_cleanup_str char *serialized = malloc(len);
	if (serialized == NULL)
		return -ENOMEM;

	int rc = minijail_marshal(j, serialized, len);
	if (rc)
		return rc;

	/* Sends [size][minijail]. */
	rc = write_exactly(fd, &len, sizeof(len));
	if (rc)
		return rc;
	return write_exactly(fd, serialized, len);
}
3093
3094
/*
 * Deep-copies |from| into |out| by marshalling and unmarshalling it.
 * Returns 0 on success or a negative errno-style value.
 */
int API minijail_copy_jail(const struct minijail *from, struct minijail *out)
{
	size_t len = minijail_size(from);

	if (len == 0)
		return -EINVAL;

	attribute_cleanup_str char *snapshot = malloc(len);
	if (snapshot == NULL)
		return -ENOMEM;

	int rc = minijail_marshal(from, snapshot, len);
	if (rc)
		return rc;

	return minijail_unmarshal(out, snapshot, len);
}
3110
3111
/*
 * Appends the minijail preload library to the child's LD_PRELOAD variable.
 * No-op on Android.  Returns 0 on success, -1 on allocation failure.
 */
static int setup_preload(const struct minijail *j attribute_unused,
			 char ***child_env attribute_unused)
{
#if defined(__ANDROID__)
	/* Don't use LDPRELOAD on Android. */
	return 0;
#else
	const char *path = j->preload_path ?: PRELOADPATH;
	const char *current = minijail_getenv(*child_env, kLdPreloadEnvVar);
	char *combined = NULL;

	if (current == NULL)
		current = "";

	/* Only insert a separating space if we have something to separate... */
	const char *sep = (current[0] != '\0') ? " " : "";
	if (asprintf(&combined, "%s%s%s", current, sep, path) < 0)
		return -1;

	int rc = minijail_setenv(child_env, kLdPreloadEnvVar, combined, 1);
	free(combined);
	return rc;
#endif
}
3137
3138
/*
3139
 * This is for logging purposes and does not change the enforced seccomp
3140
 * filter.
3141
 */
3142
static int setup_seccomp_policy_path(const struct minijail *j,
3143
             char ***child_env)
3144
0
{
3145
0
  return minijail_setenv(child_env, kSeccompPolicyPathEnvVar,
3146
0
             j->seccomp_policy_path ? j->seccomp_policy_path
3147
0
                  : "NO-LABEL",
3148
0
             1 /* overwrite */);
3149
0
}
3150
3151
static int setup_pipe(char ***child_env, int fds[2])
3152
0
{
3153
0
  int r = pipe(fds);
3154
0
  char fd_buf[11];
3155
0
  if (r)
3156
0
    return r;
3157
0
  r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
3158
0
  if (r <= 0)
3159
0
    return -EINVAL;
3160
0
  return minijail_setenv(child_env, kFdEnvVar, fd_buf, 1);
3161
0
}
3162
3163
/*
 * Closes every open fd in this process except the ones listed in
 * |inheritable_fds| (and the fd used internally to scan /proc/self/fd).
 * Returns 0 on success, -1 if /proc/self/fd cannot be opened.
 */
static int close_open_fds(int *inheritable_fds, size_t size)
{
	DIR *proc_fd_dir = opendir("/proc/self/fd");

	if (proc_fd_dir == NULL)
		return -1;

	int scan_fd = dirfd(proc_fd_dir);
	struct dirent *entry;
	while ((entry = readdir(proc_fd_dir)) != NULL) {
		char *endptr;
		const int fd = strtol(entry->d_name, &endptr, 10);

		/* Skip non-numeric entries ("." and ".."). */
		if (*endptr != '\0')
			continue;

		/*
		 * We might have set up some pipes that we want to share with
		 * the parent process, and should not be closed.
		 */
		bool keep = false;
		for (size_t i = 0; i < size; ++i) {
			if (inheritable_fds[i] == fd) {
				keep = true;
				break;
			}
		}
		/* Also avoid closing the directory fd. */
		if (!keep && fd != scan_fd)
			close(fd);
	}
	closedir(proc_fd_dir);
	return 0;
}
3199
3200
/* Return true if the specified file descriptor is already open. */
int minijail_fd_is_open(int fd)
{
	if (fcntl(fd, F_GETFD) != -1)
		return 1;
	/* Any error other than EBADF means the fd exists. */
	return errno != EBADF;
}
3205
3206
/*
3207
 * Returns true if |check_fd| is one of j->preserved_fds[:max_index].child_fd.
3208
 */
3209
static bool is_preserved_child_fd(struct minijail *j, int check_fd,
3210
          size_t max_index)
3211
0
{
3212
0
  max_index = MIN(max_index, j->preserved_fd_count);
3213
0
  for (size_t i = 0; i < max_index; i++) {
3214
0
    if (j->preserved_fds[i].child_fd == check_fd) {
3215
0
      return true;
3216
0
    }
3217
0
  }
3218
0
  return false;
3219
0
}
3220
3221
/* If parent_fd will be used by a child fd, move it to an unused fd. */
static int ensure_no_fd_conflict(struct minijail *j, int child_fd,
				 int *parent_fd, size_t max_index)
{
	/* Nothing to do unless some child mapping wants *parent_fd's slot. */
	if (!is_preserved_child_fd(j, *parent_fd, max_index)) {
		return 0;
	}

	/*
	 * If no other parent_fd matches the child_fd then use it instead of a
	 * temporary.
	 */
	int fd = child_fd;
	if (fd == -1 || minijail_fd_is_open(fd)) {
		/* Scan down from 1023 for an fd that is neither open nor
		 * claimed as a child_fd; die if none is found. */
		fd = 1023;
		while (is_preserved_child_fd(j, fd, j->preserved_fd_count) ||
		       minijail_fd_is_open(fd)) {
			--fd;
			if (fd < 0) {
				die("failed to find an unused fd");
			}
		}
	}

	int ret = dup2(*parent_fd, fd);
	/*
	 * warn() opens a file descriptor so it needs to happen after dup2 to
	 * avoid unintended side effects. This can be avoided by reordering the
	 * mapping requests so that the source fds with overlap are mapped
	 * first (unless there are cycles).
	 */
	warn("mapped fd overlap: moving %d to %d", *parent_fd, fd);
	if (ret == -1) {
		return -1;
	}

	/* Hand the relocated fd back to the caller. */
	*parent_fd = fd;
	return 0;
}
3260
3261
/*
3262
 * Check for contradictory mappings and create temporaries for parent file
3263
 * descriptors that would otherwise be overwritten during redirect_fds().
3264
 */
3265
static int prepare_preserved_fds(struct minijail *j)
3266
0
{
3267
  /* Relocate parent_fds that would be replaced by a child_fd. */
3268
0
  for (size_t i = 0; i < j->preserved_fd_count; i++) {
3269
0
    int child_fd = j->preserved_fds[i].child_fd;
3270
0
    if (is_preserved_child_fd(j, child_fd, i)) {
3271
0
      die("fd %d is mapped more than once", child_fd);
3272
0
    }
3273
3274
0
    int *parent_fd = &j->preserved_fds[i].parent_fd;
3275
0
    if (ensure_no_fd_conflict(j, child_fd, parent_fd, i) == -1) {
3276
0
      return -1;
3277
0
    }
3278
0
  }
3279
0
  return 0;
3280
0
}
3281
3282
/*
 * Structure holding resources and state created when running a minijail.
 */
struct minijail_run_state {
	pid_t child_pid; /* Forked child's pid; reset to -1 on free. */
	int pipe_fds[2]; /* Pipe carrying the marshalled jail (LD_PRELOAD). */
	int stdin_fds[2];
	int stdout_fds[2];
	int stderr_fds[2];
	int child_sync_pipe_fds[2];
	char **child_env; /* Copied child environment, if one was needed. */
};
3294
3295
/*
3296
 * Move pipe_fds if they conflict with a child_fd.
3297
 */
3298
static int avoid_pipe_conflicts(struct minijail *j,
3299
        struct minijail_run_state *state)
3300
0
{
3301
0
  int *pipe_fds[] = {
3302
0
      state->pipe_fds,   state->child_sync_pipe_fds, state->stdin_fds,
3303
0
      state->stdout_fds, state->stderr_fds,
3304
0
  };
3305
0
  for (size_t i = 0; i < ARRAY_SIZE(pipe_fds); ++i) {
3306
0
    if (pipe_fds[i][0] != -1 &&
3307
0
        ensure_no_fd_conflict(j, -1, &pipe_fds[i][0],
3308
0
            j->preserved_fd_count) == -1) {
3309
0
      return -1;
3310
0
    }
3311
0
    if (pipe_fds[i][1] != -1 &&
3312
0
        ensure_no_fd_conflict(j, -1, &pipe_fds[i][1],
3313
0
            j->preserved_fd_count) == -1) {
3314
0
      return -1;
3315
0
    }
3316
0
  }
3317
0
  return 0;
3318
0
}
3319
3320
/*
 * Redirect j->preserved_fds from the parent_fd to the child_fd.
 *
 * NOTE: This will clear FD_CLOEXEC since otherwise the child_fd would not be
 * inherited after the exec call.
 */
static int redirect_fds(struct minijail *j)
{
	for (size_t i = 0; i < j->preserved_fd_count; i++) {
		if (j->preserved_fds[i].parent_fd ==
		    j->preserved_fds[i].child_fd) {
			// Clear CLOEXEC if it is set so the FD will be
			// inherited by the child.
			int flags =
			    fcntl(j->preserved_fds[i].child_fd, F_GETFD);
			if (flags == -1 || (flags & FD_CLOEXEC) == 0) {
				continue;
			}

			// Currently FD_CLOEXEC is cleared without being
			// restored. It may make sense to track when this
			// happens and restore FD_CLOEXEC in the child process.
			flags &= ~FD_CLOEXEC;
			if (fcntl(j->preserved_fds[i].child_fd, F_SETFD,
				  flags) == -1) {
				pwarn("failed to clear CLOEXEC for %d",
				      j->preserved_fds[i].parent_fd);
			}
			continue;
		}
		// dup2(2) creates the duplicate with FD_CLOEXEC clear, so no
		// explicit fcntl() is needed on this path.
		if (dup2(j->preserved_fds[i].parent_fd,
			 j->preserved_fds[i].child_fd) == -1) {
			return -1;
		}
	}

	/*
	 * After all fds have been duped, we are now free to close all parent
	 * fds that are *not* child fds.
	 */
	for (size_t i = 0; i < j->preserved_fd_count; i++) {
		int parent_fd = j->preserved_fds[i].parent_fd;
		if (!is_preserved_child_fd(j, parent_fd,
					   j->preserved_fd_count)) {
			close(parent_fd);
		}
	}
	return 0;
}
3369
3370
static void minijail_free_run_state(struct minijail_run_state *state)
3371
0
{
3372
0
  state->child_pid = -1;
3373
3374
0
  int *fd_pairs[] = {state->pipe_fds, state->stdin_fds, state->stdout_fds,
3375
0
         state->stderr_fds, state->child_sync_pipe_fds};
3376
0
  for (size_t i = 0; i < ARRAY_SIZE(fd_pairs); ++i) {
3377
0
    close_and_reset(&fd_pairs[i][0]);
3378
0
    close_and_reset(&fd_pairs[i][1]);
3379
0
  }
3380
3381
0
  minijail_free_env(state->child_env);
3382
0
  state->child_env = NULL;
3383
0
}
3384
3385
/* Set up stdin/stdout/stderr file descriptors in the child. */
static void setup_child_std_fds(struct minijail *j,
				struct minijail_run_state *state)
{
	/* Pipe ends destined for the child's standard streams. */
	struct {
		const char *name;
		int from;
		int to;
	} fd_map[] = {
	    {"stdin", state->stdin_fds[0], STDIN_FILENO},
	    {"stdout", state->stdout_fds[1], STDOUT_FILENO},
	    {"stderr", state->stderr_fds[1], STDERR_FILENO},
	};

	for (size_t i = 0; i < ARRAY_SIZE(fd_map); ++i) {
		/* -1 means no pipe was requested for this stream. */
		if (fd_map[i].from == -1 || fd_map[i].from == fd_map[i].to)
			continue;
		if (dup2(fd_map[i].from, fd_map[i].to) == -1)
			die("failed to set up %s pipe", fd_map[i].name);
	}

	/* Close temporary pipe file descriptors. */
	int *std_pipes[] = {state->stdin_fds, state->stdout_fds,
			    state->stderr_fds};
	for (size_t i = 0; i < ARRAY_SIZE(std_pipes); ++i) {
		close_and_reset(&std_pipes[i][0]);
		close_and_reset(&std_pipes[i][1]);
	}

	/* Make sure we're not trying to skip setsid() with a PID namespace. */
	if (!j->flags.enable_new_sessions && j->flags.pids) {
		die("cannot skip setsid() with PID namespace");
	}

	/*
	 * If new sessions are enabled and any of stdin, stdout, or stderr are
	 * TTYs, or setsid flag is set, create a new session. This prevents
	 * the jailed process from using the TIOCSTI ioctl to push characters
	 * into the parent process terminal's input buffer, therefore escaping
	 * the jail.
	 *
	 * Since it has just forked, the child will not be a process group
	 * leader, and this call to setsid() should always succeed.
	 */
	if (j->flags.enable_new_sessions &&
	    (j->flags.setsid || isatty(STDIN_FILENO) || isatty(STDOUT_FILENO) ||
	     isatty(STDERR_FILENO))) {
		if (setsid() < 0) {
			pdie("setsid() failed");
		}

		if (isatty(STDIN_FILENO)) {
			/* Adopt the tty on stdin as the controlling terminal;
			 * failure here is deliberately ignored. */
			ioctl(STDIN_FILENO, TIOCSCTTY, 0);
		}
	}
}
3441
3442
/*
 * Structure that specifies how to start a minijail.
 *
 * filename - The program to exec in the child. Should be NULL if elf_fd is set.
 * elf_fd - A fd to be used with fexecve. Should be -1 if filename is set.
 *   NOTE: either filename or elf_fd is required if |exec_in_child| = 1.
 * argv - Arguments for the child program. Required if |exec_in_child| = 1.
 * envp - Environment for the child program. Available if |exec_in_child| = 1.
 *     A NULL envp means the child inherits the caller's environ.
 * use_preload - If true use LD_PRELOAD.
 * exec_in_child - If true, run |filename|. Otherwise, the child will return to
 *     the caller.
 * pstdin_fd - Filled with stdin pipe if non-NULL.
 * pstdout_fd - Filled with stdout pipe if non-NULL.
 * pstderr_fd - Filled with stderr pipe if non-NULL.
 * pchild_pid - Filled with the pid of the child process if non-NULL.
 */
struct minijail_run_config {
	const char *filename;
	int elf_fd;
	char *const *argv;
	char *const *envp;
	int use_preload;
	int exec_in_child;
	int *pstdin_fd;
	int *pstdout_fd;
	int *pstderr_fd;
	pid_t *pchild_pid;
};
3470
3471
static int
3472
minijail_run_config_internal(struct minijail *j,
3473
           const struct minijail_run_config *config);
3474
3475
/* Execs |filename| inside |j|, using the LD_PRELOAD mechanism. */
int API minijail_run(struct minijail *j, const char *filename,
		     char *const argv[])
{
	/* Fields not listed default to zero/NULL. */
	struct minijail_run_config cfg = {
	    .filename = filename,
	    .elf_fd = -1,
	    .argv = argv,
	    .use_preload = true,
	    .exec_in_child = true,
	};
	return minijail_run_config_internal(j, &cfg);
}
3488
3489
/* Like minijail_run(), but with an explicit child environment. */
int API minijail_run_env(struct minijail *j, const char *filename,
			 char *const argv[], char *const envp[])
{
	/* Fields not listed default to zero/NULL. */
	struct minijail_run_config cfg = {
	    .filename = filename,
	    .elf_fd = -1,
	    .argv = argv,
	    .envp = envp,
	    .use_preload = true,
	    .exec_in_child = true,
	};
	return minijail_run_config_internal(j, &cfg);
}
3502
3503
/* Like minijail_run(), also reporting the child's pid via |pchild_pid|. */
int API minijail_run_pid(struct minijail *j, const char *filename,
			 char *const argv[], pid_t *pchild_pid)
{
	/* Fields not listed default to zero/NULL. */
	struct minijail_run_config cfg = {
	    .filename = filename,
	    .elf_fd = -1,
	    .argv = argv,
	    .use_preload = true,
	    .exec_in_child = true,
	    .pchild_pid = pchild_pid,
	};
	return minijail_run_config_internal(j, &cfg);
}
3517
3518
/* Like minijail_run(), also returning a pipe to the child's stdin. */
int API minijail_run_pipe(struct minijail *j, const char *filename,
			  char *const argv[], int *pstdin_fd)
{
	/* Fields not listed default to zero/NULL. */
	struct minijail_run_config cfg = {
	    .filename = filename,
	    .elf_fd = -1,
	    .argv = argv,
	    .use_preload = true,
	    .exec_in_child = true,
	    .pstdin_fd = pstdin_fd,
	};
	return minijail_run_config_internal(j, &cfg);
}
3532
3533
/* Like minijail_run(), reporting the child pid and std stream pipes. */
int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
			       char *const argv[], pid_t *pchild_pid,
			       int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
{
	/* Fields not listed default to zero/NULL. */
	struct minijail_run_config cfg = {
	    .filename = filename,
	    .elf_fd = -1,
	    .argv = argv,
	    .use_preload = true,
	    .exec_in_child = true,
	    .pstdin_fd = pstdin_fd,
	    .pstdout_fd = pstdout_fd,
	    .pstderr_fd = pstderr_fd,
	    .pchild_pid = pchild_pid,
	};
	return minijail_run_config_internal(j, &cfg);
}
3551
3552
/* Like minijail_run_pid_pipes(), but with an explicit child environment. */
int API minijail_run_env_pid_pipes(struct minijail *j, const char *filename,
				   char *const argv[], char *const envp[],
				   pid_t *pchild_pid, int *pstdin_fd,
				   int *pstdout_fd, int *pstderr_fd)
{
	/* Fields not listed default to zero/NULL. */
	struct minijail_run_config cfg = {
	    .filename = filename,
	    .elf_fd = -1,
	    .argv = argv,
	    .envp = envp,
	    .use_preload = true,
	    .exec_in_child = true,
	    .pstdin_fd = pstdin_fd,
	    .pstdout_fd = pstdout_fd,
	    .pstderr_fd = pstderr_fd,
	    .pchild_pid = pchild_pid,
	};
	return minijail_run_config_internal(j, &cfg);
}
3571
3572
/* Like minijail_run_env_pid_pipes(), execing via fexecve on |elf_fd|. */
int API minijail_run_fd_env_pid_pipes(struct minijail *j, int elf_fd,
				      char *const argv[], char *const envp[],
				      pid_t *pchild_pid, int *pstdin_fd,
				      int *pstdout_fd, int *pstderr_fd)
{
	/* .filename stays NULL: the program comes from |elf_fd|. */
	struct minijail_run_config cfg = {
	    .elf_fd = elf_fd,
	    .argv = argv,
	    .envp = envp,
	    .use_preload = true,
	    .exec_in_child = true,
	    .pstdin_fd = pstdin_fd,
	    .pstdout_fd = pstdout_fd,
	    .pstderr_fd = pstderr_fd,
	    .pchild_pid = pchild_pid,
	};
	return minijail_run_config_internal(j, &cfg);
}
3591
3592
/* Like minijail_run(), but without the LD_PRELOAD mechanism. */
int API minijail_run_no_preload(struct minijail *j, const char *filename,
				char *const argv[])
{
	/* Fields not listed default to zero/NULL. */
	struct minijail_run_config cfg = {
	    .filename = filename,
	    .elf_fd = -1,
	    .argv = argv,
	    .use_preload = false,
	    .exec_in_child = true,
	};
	return minijail_run_config_internal(j, &cfg);
}
3605
3606
/* Like minijail_run_pid_pipes(), but without the LD_PRELOAD mechanism. */
int API minijail_run_pid_pipes_no_preload(struct minijail *j,
					  const char *filename,
					  char *const argv[], pid_t *pchild_pid,
					  int *pstdin_fd, int *pstdout_fd,
					  int *pstderr_fd)
{
	/* Fields not listed default to zero/NULL. */
	struct minijail_run_config cfg = {
	    .filename = filename,
	    .elf_fd = -1,
	    .argv = argv,
	    .use_preload = false,
	    .exec_in_child = true,
	    .pstdin_fd = pstdin_fd,
	    .pstdout_fd = pstdout_fd,
	    .pstderr_fd = pstderr_fd,
	    .pchild_pid = pchild_pid,
	};
	return minijail_run_config_internal(j, &cfg);
}
3626
3627
/* Like minijail_run_env_pid_pipes(), but without the LD_PRELOAD mechanism. */
int API minijail_run_env_pid_pipes_no_preload(struct minijail *j,
					      const char *filename,
					      char *const argv[],
					      char *const envp[],
					      pid_t *pchild_pid, int *pstdin_fd,
					      int *pstdout_fd, int *pstderr_fd)
{
	/* Fields not listed default to zero/NULL. */
	struct minijail_run_config cfg = {
	    .filename = filename,
	    .elf_fd = -1,
	    .argv = argv,
	    .envp = envp,
	    .use_preload = false,
	    .exec_in_child = true,
	    .pstdin_fd = pstdin_fd,
	    .pstdout_fd = pstdout_fd,
	    .pstderr_fd = pstderr_fd,
	    .pchild_pid = pchild_pid,
	};
	return minijail_run_config_internal(j, &cfg);
}
3648
3649
/* Forks into |j| without exec'ing anything (exec_in_child stays false). */
pid_t API minijail_fork(struct minijail *j)
{
	/* All other fields default to zero/NULL. */
	struct minijail_run_config cfg = {
	    .elf_fd = -1,
	};
	return minijail_run_config_internal(j, &cfg);
}
3656
3657
/*
 * Performs one jailed run described by |config|.
 *
 * Forks (or clone(2)s, when entering a PID namespace) the child, applies the
 * jail, and either execve(2)/fexecve(2)s the target in the child or returns
 * in both parent and child (the minijail_fork() case, exec_in_child == false).
 * On success the parent gets 0 and |state_out| is filled in (child pid,
 * pipe fds, child environment).
 *
 * NOTE(review): pre-fork error returns mix conventions (ENOMEM, EFAULT,
 * -EFAULT); callers appear to only test for non-zero — confirm before
 * normalizing.
 */
static int minijail_run_internal(struct minijail *j,
				 const struct minijail_run_config *config,
				 struct minijail_run_state *state_out)
{
	int sync_child = 0;
	int ret;
	/* We need to remember this across the minijail_preexec() call. */
	int pid_namespace = j->flags.pids;
	/*
	 * Create an init process if we are entering a pid namespace, unless the
	 * user has explicitly opted out by calling minijail_run_as_init().
	 */
	int do_init = j->flags.do_init && !j->flags.run_as_init;
	int use_preload = config->use_preload;

	if (config->filename != NULL && config->elf_fd != -1) {
		die("filename and elf_fd cannot be set at the same time");
	}
	if (config->filename != NULL) {
		/* Owned copy; freed in minijail_destroy(). */
		j->filename = strdup(config->filename);
	}

	/*
	 * Only copy the environment if we need to modify it. If this is done
	 * unconditionally, it triggers odd behavior in the ARC container.
	 */
	if (use_preload || j->seccomp_policy_path) {
		state_out->child_env =
		    minijail_copy_env(config->envp ? config->envp : environ);
		if (!state_out->child_env)
			return ENOMEM;
	}

	if (j->seccomp_policy_path &&
	    setup_seccomp_policy_path(j, &state_out->child_env))
		return -EFAULT;

	if (use_preload) {
		if (j->hooks_head != NULL)
			die("Minijail hooks are not supported with LD_PRELOAD");
		if (!config->exec_in_child)
			die("minijail_fork is not supported with LD_PRELOAD");

		/*
		 * Before we fork(2) and execve(2) the child process, we need
		 * to open a pipe(2) to send the minijail configuration over.
		 */
		if (setup_preload(j, &state_out->child_env) ||
		    setup_pipe(&state_out->child_env, state_out->pipe_fds))
			return -EFAULT;
	} else {
		if (j->flags.use_caps && j->caps != 0 &&
		    !j->flags.set_ambient_caps) {
			die("non-empty, non-ambient capabilities are not "
			    "supported without LD_PRELOAD");
		}
	}

	/* Create pipes for stdin/stdout/stderr as requested by caller. */
	struct {
		bool requested;
		int *pipe_fds;
	} pipe_fd_req[] = {
	    {config->pstdin_fd != NULL, state_out->stdin_fds},
	    {config->pstdout_fd != NULL, state_out->stdout_fds},
	    {config->pstderr_fd != NULL, state_out->stderr_fds},
	};

	for (size_t i = 0; i < ARRAY_SIZE(pipe_fd_req); ++i) {
		if (pipe_fd_req[i].requested &&
		    pipe(pipe_fd_req[i].pipe_fds) == -1)
			return EFAULT;
	}

	/*
	 * If the parent process needs to configure the child's runtime
	 * environment after forking, create a pipe(2) to block the child until
	 * configuration is done.
	 */
	if (j->flags.forward_signals || j->flags.pid_file || j->flags.cgroups ||
	    j->rlimit_count || j->flags.userns) {
		sync_child = 1;
		if (pipe(state_out->child_sync_pipe_fds))
			return -EFAULT;
	}

	/*
	 * Use sys_clone() if and only if we're creating a pid namespace.
	 *
	 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
	 *
	 * In multithreaded programs, there are a bunch of locks inside libc,
	 * some of which may be held by other threads at the time that we call
	 * minijail_run_pid(). If we call fork(), glibc does its level best to
	 * ensure that we hold all of these locks before it calls clone()
	 * internally and drop them after clone() returns, but when we call
	 * sys_clone(2) directly, all that gets bypassed and we end up with a
	 * child address space where some of libc's important locks are held by
	 * other threads (which did not get cloned, and hence will never release
	 * those locks). This is okay so long as we call exec() immediately
	 * after, but a bunch of seemingly-innocent libc functions like setenv()
	 * take locks.
	 *
	 * Hence, only call sys_clone() if we need to, in order to get at pid
	 * namespacing. If we follow this path, the child's address space might
	 * have broken locks; you may only call functions that do not acquire
	 * any locks.
	 *
	 * Unfortunately, fork() acquires every lock it can get its hands on, as
	 * previously detailed, so this function is highly likely to deadlock
	 * later on (see "deadlock here") if we're multithreaded.
	 *
	 * We might hack around this by having the clone()d child (init of the
	 * pid namespace) return directly, rather than leaving the clone()d
	 * process hanging around to be init for the new namespace (and having
	 * its fork()ed child return in turn), but that process would be
	 * crippled with its libc locks potentially broken. We might try
	 * fork()ing in the parent before we clone() to ensure that we own all
	 * the locks, but then we have to have the forked child hanging around
	 * consuming resources (and possibly having file descriptors / shared
	 * memory regions / etc attached). We'd need to keep the child around to
	 * avoid having its children get reparented to init.
	 *
	 * TODO(b/317404364): figure out if the "forked child hanging around"
	 * problem is fixable or not. It would be nice if we worked in this
	 * case.
	 */
	pid_t child_pid;
	if (pid_namespace) {
		unsigned long clone_flags = CLONE_NEWPID | SIGCHLD;
		if (j->flags.userns)
			clone_flags |= CLONE_NEWUSER;

		child_pid = syscall(SYS_clone, clone_flags, NULL, 0L, 0L, 0L);

		if (child_pid < 0) {
			if (errno == EPERM)
				pdie("clone(CLONE_NEWPID | ...) failed with "
				     "EPERM; is this process missing "
				     "CAP_SYS_ADMIN?");
			pdie("clone(CLONE_NEWPID | ...) failed");
		}
	} else {
		if (j->flags.userns)
			die("user namespaces in Minijail require a PID "
			    "namespace");

		child_pid = fork();

		if (child_pid < 0)
			pdie("fork failed");
	}

	/*
	 * setup_fs_rules_fd() needs to be called before close_open_fds(), and
	 * before logic for the child process.
	 */
	if (j->fs_rules_head) {
		setup_fs_rules_fd(j);
		minijail_preserve_fd(j, j->fs_rules_fd, j->fs_rules_fd);
	}

	state_out->child_pid = child_pid;
	/* Parent process (child_pid != 0): configure the child, then return. */
	if (child_pid) {
		j->initpid = child_pid;

		if (j->flags.forward_signals) {
			forward_pid = child_pid;
			install_signal_handlers();
		}

		if (j->flags.pid_file)
			write_pid_file_or_die(j);

		if (j->flags.cgroups)
			add_to_cgroups_or_die(j);

		if (j->rlimit_count)
			set_rlimits_or_die(j);

		if (j->flags.userns)
			write_ugid_maps_or_die(j);

		/* The namespace fds are only needed by the child from here. */
		if (j->flags.enter_vfs)
			close(j->mountns_fd);

		if (j->flags.enter_net)
			close(j->netns_fd);

		if (sync_child)
			parent_setup_complete(state_out->child_sync_pipe_fds);

		if (use_preload) {
			/*
			 * Add SIGPIPE to the signal mask to avoid getting
			 * killed if the child process finishes or closes its
			 * end of the pipe prematurely.
			 *
			 * TODO(crbug.com/1022170): Use pthread_sigmask instead
			 * of sigprocmask if Minijail is used in multithreaded
			 * programs.
			 */
			sigset_t to_block, to_restore;
			if (sigemptyset(&to_block) < 0)
				pdie("sigemptyset failed");
			if (sigaddset(&to_block, SIGPIPE) < 0)
				pdie("sigaddset failed");
			if (sigprocmask(SIG_BLOCK, &to_block, &to_restore) < 0)
				pdie("sigprocmask failed");

			/* Send marshalled minijail. */
			close_and_reset(&state_out->pipe_fds[0]);
			ret = minijail_to_fd(j, state_out->pipe_fds[1]);
			close_and_reset(&state_out->pipe_fds[1]);

			/* Accept any pending SIGPIPE. */
			while (true) {
				const struct timespec zero_time = {0, 0};
				const int sig =
				    sigtimedwait(&to_block, NULL, &zero_time);
				if (sig < 0) {
					if (errno != EINTR)
						break;
				} else {
					if (sig != SIGPIPE)
						die("unexpected signal %d",
						    sig);
				}
			}

			/* Restore the signal mask to its original state. */
			if (sigprocmask(SIG_SETMASK, &to_restore, NULL) < 0)
				pdie("sigprocmask failed");

			if (ret) {
				warn("failed to send marshalled minijail: %s",
				     strerror(-ret));
				kill(j->initpid, SIGKILL);
			}
		}

		return 0;
	}

	/* Child process. */
	if (j->flags.reset_signal_mask) {
		sigset_t signal_mask;
		if (sigemptyset(&signal_mask) != 0)
			pdie("sigemptyset failed");
		if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0)
			pdie("sigprocmask failed");
	}

	if (j->flags.reset_signal_handlers) {
		int signum;
		for (signum = 0; signum <= SIGRTMAX; signum++) {
			/*
			 * Ignore EINVAL since some signal numbers in the range
			 * might not be valid.
			 */
			if (signal(signum, SIG_DFL) == SIG_ERR &&
			    errno != EINVAL) {
				pdie("failed to reset signal %d disposition",
				     signum);
			}
		}
	}

	if (j->flags.close_open_fds) {
		/*
		 * 11 covers the fixed fds below (5 pipe pairs + elf_fd);
		 * the rest is room for caller-preserved fds.
		 */
		const size_t kMaxInheritableFdsSize = 11 + MAX_PRESERVED_FDS;
		int inheritable_fds[kMaxInheritableFdsSize];
		size_t size = 0;

		int *pipe_fds[] = {
		    state_out->pipe_fds,   state_out->child_sync_pipe_fds,
		    state_out->stdin_fds,  state_out->stdout_fds,
		    state_out->stderr_fds,
		};

		for (size_t i = 0; i < ARRAY_SIZE(pipe_fds); ++i) {
			if (pipe_fds[i][0] != -1) {
				inheritable_fds[size++] = pipe_fds[i][0];
			}
			if (pipe_fds[i][1] != -1) {
				inheritable_fds[size++] = pipe_fds[i][1];
			}
		}

		/*
		 * Preserve namespace file descriptors over the close_open_fds()
		 * call. These are closed in minijail_enter() so they won't leak
		 * into the child process.
		 */
		if (j->flags.enter_vfs)
			minijail_preserve_fd(j, j->mountns_fd, j->mountns_fd);
		if (j->flags.enter_net)
			minijail_preserve_fd(j, j->netns_fd, j->netns_fd);

		for (size_t i = 0; i < j->preserved_fd_count; i++) {
			/*
			 * Preserve all parent_fds. They will be dup2(2)-ed in
			 * the child later.
			 */
			inheritable_fds[size++] = j->preserved_fds[i].parent_fd;
		}

		if (config->elf_fd > -1) {
			inheritable_fds[size++] = config->elf_fd;
		}

		if (close_open_fds(inheritable_fds, size) < 0)
			die("failed to close open file descriptors");
	}

	/* The set of fds will be replaced. */
	if (prepare_preserved_fds(j))
		die("failed to set up fd redirections");

	if (avoid_pipe_conflicts(j, state_out))
		die("failed to redirect conflicting pipes");

	/* The elf_fd needs to be mutable so use a stack copy from now on. */
	int elf_fd = config->elf_fd;
	if (elf_fd != -1 &&
	    ensure_no_fd_conflict(j, -1, &elf_fd, j->preserved_fd_count))
		die("failed to redirect elf_fd");

	if (redirect_fds(j))
		die("failed to set up fd redirections");

	if (sync_child)
		wait_for_parent_setup(state_out->child_sync_pipe_fds);

	if (j->flags.userns)
		enter_user_namespace(j);

	setup_child_std_fds(j, state_out);

	/* If running an init program, let it decide when/how to mount /proc. */
	if (pid_namespace && !do_init)
		j->flags.remount_proc_ro = 0;

	if (use_preload) {
		/* Strip out flags that cannot be inherited across execve(2). */
		minijail_preexec(j);
	} else {
		/*
		 * If not using LD_PRELOAD, do all jailing before execve(2).
		 * Note that PID namespaces can only be entered on fork(2),
		 * so that flag is still cleared.
		 */
		j->flags.pids = 0;
	}

	/*
	 * Jail this process.
	 * If forking, return.
	 * If not, execve(2) the target.
	 */
	minijail_enter(j);

	if (config->exec_in_child && pid_namespace && do_init) {
		/*
		 * pid namespace: this process will become init inside the new
		 * namespace. We don't want all programs we might exec to have
		 * to know how to be init. Normally (do_init == 1) we fork off
		 * a child to actually run the program. If |do_init == 0|, we
		 * let the program keep pid 1 and be init.
		 *
		 * If we're multithreaded, we'll probably deadlock here. See
		 * WARNING above.
		 */
		child_pid = fork();
		if (child_pid < 0) {
			_exit(child_pid);
		} else if (child_pid > 0) {
			minijail_free_run_state(state_out);

			/*
			 * Best effort. Don't bother checking the return value.
			 */
			prctl(PR_SET_NAME, "minijail-init");
			init(child_pid); /* Never returns. */
		}
		state_out->child_pid = child_pid;
	}

	run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_EXECVE);

	if (!config->exec_in_child)
		return 0;

	/*
	 * We're going to execve(), so make sure any remaining resources are
	 * freed. Exceptions are:
	 *  1. The child environment. No need to worry about freeing it since
	 *     execve reinitializes the heap anyways.
	 *  2. The read side of the LD_PRELOAD pipe, which we need to hand down
	 *     into the target in which the preloaded code will read from it and
	 *     then close it.
	 */
	state_out->pipe_fds[0] = -1;
	char *const *child_env = state_out->child_env;
	state_out->child_env = NULL;
	minijail_free_run_state(state_out);

	/*
	 * If we aren't pid-namespaced, or the jailed program asked to be init:
	 *   calling process
	 *   -> execve()-ing process
	 * If we are:
	 *   calling process
	 *   -> init()-ing process
	 *      -> execve()-ing process
	 */
	if (!child_env)
		child_env = config->envp ? config->envp : environ;
	if (elf_fd > -1) {
		fexecve(elf_fd, config->argv, child_env);
		pwarn("fexecve(%d) failed", config->elf_fd);
	} else {
		execve(config->filename, config->argv, child_env);
		pwarn("execve(%s) failed", config->filename);
	}

	/* Only reached if exec failed. */
	ret = (errno == ENOENT ? MINIJAIL_ERR_NO_COMMAND
			       : MINIJAIL_ERR_NO_ACCESS);
	_exit(ret);
}
4086
4087
static int
4088
minijail_run_config_internal(struct minijail *j,
4089
           const struct minijail_run_config *config)
4090
0
{
4091
0
  struct minijail_run_state state = {
4092
0
      .child_pid = -1,
4093
0
      .pipe_fds = {-1, -1},
4094
0
      .stdin_fds = {-1, -1},
4095
0
      .stdout_fds = {-1, -1},
4096
0
      .stderr_fds = {-1, -1},
4097
0
      .child_sync_pipe_fds = {-1, -1},
4098
0
      .child_env = NULL,
4099
0
  };
4100
0
  int ret = minijail_run_internal(j, config, &state);
4101
4102
0
  if (ret == 0) {
4103
0
    if (config->pchild_pid)
4104
0
      *config->pchild_pid = state.child_pid;
4105
4106
    /* Grab stdin/stdout/stderr descriptors requested by caller. */
4107
0
    struct {
4108
0
      int *pfd;
4109
0
      int *psrc;
4110
0
    } fd_map[] = {
4111
0
        {config->pstdin_fd, &state.stdin_fds[1]},
4112
0
        {config->pstdout_fd, &state.stdout_fds[0]},
4113
0
        {config->pstderr_fd, &state.stderr_fds[0]},
4114
0
    };
4115
4116
0
    for (size_t i = 0; i < ARRAY_SIZE(fd_map); ++i) {
4117
0
      if (fd_map[i].pfd) {
4118
0
        *fd_map[i].pfd = *fd_map[i].psrc;
4119
0
        *fd_map[i].psrc = -1;
4120
0
      }
4121
0
    }
4122
4123
0
    if (!config->exec_in_child)
4124
0
      ret = state.child_pid;
4125
0
  }
4126
4127
0
  minijail_free_run_state(&state);
4128
4129
0
  return ret;
4130
0
}
4131
4132
static int minijail_wait_internal(struct minijail *j, int expected_signal)
4133
0
{
4134
0
  if (j->initpid <= 0)
4135
0
    return -ECHILD;
4136
4137
0
  int st;
4138
0
  while (true) {
4139
0
    const int ret = waitpid(j->initpid, &st, 0);
4140
0
    if (ret >= 0)
4141
0
      break;
4142
0
    if (errno != EINTR)
4143
0
      return -errno;
4144
0
  }
4145
4146
0
  if (!WIFEXITED(st)) {
4147
0
    int error_status = st;
4148
0
    if (!WIFSIGNALED(st)) {
4149
0
      return error_status;
4150
0
    }
4151
4152
0
    int signum = WTERMSIG(st);
4153
    /*
4154
     * We return MINIJAIL_ERR_SECCOMP_VIOLATION if the process
4155
     * received SIGSYS, which happens when a syscall is blocked by
4156
     * SECCOMP filters.
4157
     *
4158
     * If not, we do what bash(1) does: $? = 128 + signum
4159
     */
4160
0
    if (signum == SIGSYS) {
4161
0
      warn("child process %d had a policy violation (%s)",
4162
0
           j->initpid,
4163
0
           j->seccomp_policy_path ? j->seccomp_policy_path
4164
0
                : "NO-LABEL");
4165
0
      error_status = MINIJAIL_ERR_SECCOMP_VIOLATION;
4166
0
    } else {
4167
0
      if (signum != expected_signal) {
4168
0
        warn("child process %d received signal %d",
4169
0
             j->initpid, signum);
4170
0
      }
4171
0
      error_status = MINIJAIL_ERR_SIG_BASE + signum;
4172
0
    }
4173
0
    return error_status;
4174
0
  }
4175
4176
0
  int exit_status = WEXITSTATUS(st);
4177
0
  if (exit_status != 0)
4178
0
    info("child process %d exited with status %d", j->initpid,
4179
0
         exit_status);
4180
4181
0
  return exit_status;
4182
0
}
4183
4184
/*
 * Sends SIGTERM to the jailed process and reaps it, treating SIGTERM as
 * the expected termination signal. Returns -ECHILD if nothing was
 * launched, -errno if kill(2) fails, otherwise the wait result.
 */
int API minijail_kill(struct minijail *j)
{
	if (j->initpid <= 0)
		return -ECHILD;

	return kill(j->initpid, SIGTERM)
		   ? -errno
		   : minijail_wait_internal(j, SIGTERM);
}
4194
4195
/* Reaps the jailed process; any fatal signal is unexpected and warned on. */
int API minijail_wait(struct minijail *j)
{
	return minijail_wait_internal(j, /*expected_signal=*/0);
}
4199
4200
/*
 * Releases |j| and every heap resource it owns (filter program, mount and
 * hook lists, fs rules, owned strings, cgroup paths) and finally |j|
 * itself. |j| must not be used afterwards.
 *
 * Note: free(NULL) is a guaranteed no-op in C, so the individual members
 * are freed unconditionally; only |filter_prog| needs a guard because it
 * is dereferenced first.
 */
void API minijail_destroy(struct minijail *j)
{
	size_t i;

	if (j->filter_prog) {
		free(j->filter_prog->filter);
		free(j->filter_prog);
	}
	free_mounts_list(j);
	free_remounts_list(j);
	while (j->hooks_head) {
		struct hook *c = j->hooks_head;
		j->hooks_head = c->next;
		free(c);
	}
	j->hooks_tail = NULL;
	free_fs_rules_list(j);
	free(j->user);
	free(j->suppl_gid_list);
	free(j->chrootdir);
	free(j->pid_file_path);
	free(j->uidmap);
	free(j->gidmap);
	free(j->hostname);
	free(j->preload_path);
	free(j->filename);
	free(j->alt_syscall_table);
	for (i = 0; i < j->cgroup_count; ++i)
		free(j->cgroups[i]);
	free(j->seccomp_policy_path);
	free(j);
}
4243
4244
/*
 * Redirects Minijail's logging to file descriptor |fd|, suppressing
 * messages below |min_priority| (syslog-style priority — TODO confirm
 * against init_logging()'s contract).
 */
void API minijail_log_to_fd(int fd, int min_priority)
{
	init_logging(LOG_TO_FD, fd, min_priority);
}
4248
4249
/*
 * Returns a human-readable name for syscall number |nr|. When the jail
 * uses an alternate syscall table, numbers don't correspond to the
 * regular table, so a placeholder name is returned instead.
 */
const char API *minijail_syscall_name(const struct minijail *j, long nr)
{
	const bool uses_alt_table = j && j->flags.alt_syscall;

	return uses_alt_table ? kAltSyscallNamePlaceholder
			      : lookup_syscall_name(nr);
}